In [None]:
#Import Libraries

from pyspark.sql import SparkSession

# Spark SQL helper functions and column types (the Spark session itself is created in a later cell)
from pyspark.sql import functions as func # To use 'sum', 'count', and other functions
from pyspark.sql.types import IntegerType, DoubleType

### Pandas vs PySpark
The complexity of Data Processing Tasks: PySpark is more suitable for complex data processing tasks that involve multiple stages of data transformation and analysis. Pandas is more suitable for simple data analysis tasks that involve filtering, selecting, and aggregating data.

In [None]:
import pandas as pd # Python library to manage dataframes, similar as PySpark

In [None]:
# Reuse the active Spark session if there is one; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Load the semicolon-delimited file, using its first line as the column names.
csv_reader = spark.read.option('header','true').options(delimiter=";")
bank_data = csv_reader.csv('bank.csv', inferSchema=True)
bank_data.show()

# NOTE: what does inferSchema=True do?
# It asks Spark to guess the data type of every column from the data itself,
# at the cost of an extra pass over the file. With the option off, columns are
# read as plain strings unless a schema is supplied explicitly.

In [None]:
# Print the dataframe's schema, i.e. the type Spark assigned to each column

bank_data.printSchema()

In [None]:
# Fetch the first 5 rows of the dataframe (head returns the rows themselves
# rather than printing a formatted table the way show() does)

bank_data.head(5)

## TASK 1

### Calculate the Mean, Median, and Standard Deviation of all the variables/attributes of numeric type:

**Getting a Database:** Once you have a connected instance of MongoClient, you can access any database managed by the specified MongoDB server. To define which database you want to use, you can use the dot notation.

### Method 1

### Using 'describe' function to provide the basic statistics

In [None]:
# Summary statistics for the attributes Task 1 treats as numeric.
summary_columns = ['age', 'balance', 'day', 'duration', 'pdays', 'previous']
bank_numeric = bank_data.select(*summary_columns)

# describe() reports count, mean, stddev, min and max -- but not the median
# that the task asks for.
bank_numeric.describe().show()

# summary() accepts percentile labels; "50%" is the (approximate) median.
bank_numeric.summary("50%").show()

### Method 2

In [None]:
# Standard deviation of the 'age' column
age_stddev = bank_data.select(func.stddev('age'))
age_stddev.show()

In [None]:
# Mean of the 'age' column
age_mean = bank_data.select(func.mean('age'))
age_mean.show()

In [None]:
# Largest value in the 'age' column
age_max = bank_data.select(func.max('age'))
age_max.show()

In [None]:
# Smallest value in the 'age' column
age_min = bank_data.select(func.min('age'))
age_min.show()

In [None]:
# Mean of every numeric column.
# The dataframe also holds text columns (e.g. 'job'); a mean is meaningless
# for those and, depending on the Spark configuration, yields null or a cast
# error. Only columns whose Spark type is numeric are therefore averaged.
from pyspark.sql.types import NumericType

numeric_columns = [
    field.name
    for field in bank_data.schema.fields
    if isinstance(field.dataType, NumericType)
]
bank_data.select([func.mean(c) for c in numeric_columns]).first()

### Task 2

### Create a bar graph of the variables/attribute: Previous

#### We can use the built-in functionality of Pandas to draw a chart

### Notes

ascending = False means from highest to lowest

groupBy: It is used for grouping the data points (i.e. rows) based on the distinct values in the given column or columns. We can then calculate aggregated values for the generated groups.

orderBy: Sorting


In [None]:
# Group the rows by the distinct values of the 'job' column, total the numeric
# columns within each group, and rank the groups by their 'previous' total
# (largest first).
job_totals = bank_data.groupBy('job').sum()
df = job_totals.orderBy("sum(previous)", ascending=False)
df.head(10)

#### Method 1: Using Pandas to graph

In [None]:
# Pandas has built-in plotting, so convert the PySpark dataframe to Pandas
# first and draw the bar chart from there.
job_totals_pd = df.toPandas()
job_totals_pd.plot.bar(x='job', y='sum(previous)')

#### Method 2: Using Matplotlib to graph

In [None]:
# Matplotlib is a general-purpose Python plotting library (static, animated
# and interactive visualizations).
import matplotlib.pyplot as plt

# Convert the PySpark dataframe to Pandas so its columns can be handed to Matplotlib
df_pan = df.toPandas()
job_names = df_pan['job']
previous_totals = df_pan['sum(previous)']

# One bar per job
plt.bar(job_names, previous_totals, width=0.5, color="orange")

# Title and axis labels; tick labels are turned vertical
plt.title('Plot Title')
plt.xlabel('job')
plt.ylabel('sum(previous)')
plt.xticks(rotation=90)

plt.show()

## Task 3

#### Create a Normalised bar graph of the variable/attribute: Previous

### Normalization

**Normalization:** The goal of normalization is to transform features to be on a similar scale. This improves the performance and training stability of the model.

Add normalised columns to the input dataframe.

#### Using the min-max feature scaling

The min-max approach (often called **Normalization**) rescales the feature to a fixed range of [0,1] by subtracting the minimum value of the feature and then dividing by the range (maximum minus minimum). We can apply the min-max scaling in Pandas using the `.min()` and `.max()` methods.

$$
 \frac{x-min}{max-min}
$$

In [None]:
# Min-max scaling of the per-job 'previous' totals: (x - min) / (max - min)
df_result = df.toPandas()
totals = df_result["sum(previous)"]
lowest, highest = totals.min(), totals.max()
df_result["sum(previous)_norm"] = (totals - lowest) / (highest - lowest)

In [None]:
# Inspect the Pandas dataframe, including the new normalised column
df_result

In [None]:
# Bar chart of the min-max normalised totals, one bar per job
df_result.plot(kind='bar', x='job', y='sum(previous)_norm')

### Standardization

#### Using z-score method:

The z-score method (often called **Standardization**) transforms the data into a distribution with a mean of 0 and a standard deviation of 1. Each standardized value is computed by subtracting the mean of the corresponding feature and then dividing by the standard deviation.

$$
 \frac{x-mean}{stdev}
$$

In [None]:
# Z-score of the per-job 'previous' totals: (x - mean) / stdev
totals = df_result["sum(previous)"]
df_result["sum(previous)_std"] = (totals - totals.mean()) / totals.std()

In [None]:
# Inspect the Pandas dataframe again, now with the standardised column
df_result

In [None]:
# Bar chart of the standardised (z-score) totals, one bar per job
df_result.plot(kind='bar', x='job', y='sum(previous)_std')

In [None]:
# Sanity check: the mean of the standardised column should be (numerically) 0
df_result["sum(previous)_std"].mean()

In [None]:
# Sanity check: the standard deviation of the standardised column should be 1
df_result["sum(previous)_std"].std()

### Task 4

#### Create a histogram of the variable/attribute: age

In [None]:
# Keep only the 'age' column, convert it to Pandas, and draw its histogram
df = bank_data.select('age')
age_pd = df.toPandas()
age_pd.hist(column='age')

### Task 5

#### Create a histogram of the normalised variable/attribute: age

In [None]:
# Min-max scaling of 'age': (x - min) / (max - min)
df_result = df.toPandas()
ages = df_result["age"]
youngest, oldest = ages.min(), ages.max()
df_result["age_norm"] = (ages - youngest) / (oldest - youngest)
df_result

In [None]:
# Histogram of the min-max normalised ages
df_result.hist(column='age_norm')

### Task 6

#### Bin (groupby) the variable/attribute age and create a bar chart

In [None]:
# Average age per job, ordered from the highest average to the lowest,
# then drawn as a bar chart via Pandas
mean_age_by_job = bank_data.groupBy('job').agg(func.mean('age'))
df = mean_age_by_job.orderBy('avg(age)', ascending=False)
df.toPandas().plot.bar(x='job', y='avg(age)')

### Task 7

#### Create a scatter plot for the following variable/attribute: age and balance

In [None]:
# Scatter plot of balance against age, drawn with Pandas
df = bank_data.select('age', 'balance')
age_balance_pd = df.toPandas()
age_balance_pd.plot.scatter(x='age', y='balance')

## Cleaning Data

Consider a scenario in which the owner of the data set decides that any data under the ‘Job’ column that contains ‘unknown’ or ‘unemployed’ data should be considered inaccurate. Thus, those data should be removed from the dataset before data exploration.

As part of this data removal operation (i.e., the data cleaning operation), you are requested to get rid of all the data rows in which the ‘Job’ column contains ‘unknown’ or ‘unemployed’ instead of data. You can remove these either using PySpark or Excel. Once these data have been removed, save the remaining data in a csv file named bank.csv as before.

#### Filtering

In [None]:
# 'where' keeps only the rows that satisfy the given condition
is_unemployed = bank_data['job'] == 'unemployed'
bank_data.where(is_unemployed).show()

In [None]:
# Several conditions are combined with & (and) and | (or)
job_col = bank_data['job']
is_unwanted_job = (job_col == 'unknown') | (job_col == 'unemployed')
bank_data.where(is_unwanted_job).show()

#### Number of rows

In [None]:
# Total number of rows in the original dataframe
bank_data.count()

In [None]:
# Number of rows whose job is 'unknown' or 'unemployed'
job_col = bank_data['job']
is_unwanted_job = (job_col == 'unknown') | (job_col == 'unemployed')
bank_data.where(is_unwanted_job).count()

#### Removing the unwanted rows

In [None]:
# Keep the rows whose job is neither 'unknown' nor 'unemployed',
# written here as not-(unknown or unemployed)
job_col = bank_data['job']
df_cleaned = bank_data.where(~((job_col == 'unknown') | (job_col == 'unemployed')))

In [None]:
# Number of rows left in the cleaned dataframe
df_cleaned.count()

### Finding Duplicated Rows

In [None]:
# Duplicate detection is done in Pandas: build a boolean mask of duplicated
# rows, then print the rows it selects.
df = bank_data.toPandas()
repeat_mask = df.duplicated()
duplicated_rows = df[repeat_mask]
print(duplicated_rows)


#### Writing the cleaned dataframe to a csv file

In [None]:
# Persist the cleaned rows as CSV.
# - header=True: the input file was read with a header row; without this
#   option the column names would be missing from the output.
# - mode("overwrite"): the default save mode raises an error when the target
#   path already exists, so re-running this cell would otherwise fail.
# Note: Spark writes a directory with this name containing part files, and the
# output uses the default ',' separator rather than the ';' of the input file.
(
    df_cleaned.write
    .mode("overwrite")
    .option("header", True)
    .csv("bank_data_cleaned.csv")
)