# 1.2 Ethics & Discretion of Machine Learning Programmes

## Contents

### 01. Importing Libraries

### 02. Importing Data

### 03. Scaling

### 04. Export as .csv

#### **01. Importing Libraries**

In [16]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.preprocessing import StandardScaler

In [17]:
# Create path
# The project root defaults to the original author's local folder. Set the
# CF_PROJECT_DIR environment variable to run this notebook on another machine
# without editing the cell.
path = os.environ.get(
    'CF_PROJECT_DIR',
    r'/Users/yaseminmustafa/Desktop/CareerFoundry/Specialisation',
)

#### **02. Importing Data**

In [18]:
# Load the climate dataset from the '1.1' subfolder of the project directory
df = pd.read_csv(os.path.join(path, '1.1', 'Climate.csv'))

In [19]:
# Check data: preview the first five rows of the raw frame
df.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [20]:
# Check shape as (rows, columns)
df.shape

(22950, 170)

In [21]:
# Add an explicit 'id' column holding the current row index.
# The guard keeps this cell idempotent: without it, re-running the cell would
# insert a second index column and rename it to 'id' again, leaving the frame
# with duplicate 'id' columns.
if 'id' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'id'})

# Check data
df.head()

Unnamed: 0,id,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [22]:
# Display summary information about the DataFrame (index range, column count, dtypes, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22950 entries, 0 to 22949
Columns: 171 entries, id to VALENTIA_temp_max
dtypes: float64(145), int64(26)
memory usage: 29.9 MB


#### **03. Scaling**

#####  **- Check for (and handle) missing values**

In [23]:
# Count the missing values in each column of df
df.isnull().sum()

id                     0
DATE                   0
MONTH                  0
BASEL_cloud_cover      0
BASEL_wind_speed       0
                      ..
VALENTIA_snow_depth    0
VALENTIA_sunshine      0
VALENTIA_temp_mean     0
VALENTIA_temp_min      0
VALENTIA_temp_max      0
Length: 171, dtype: int64

##### **No missing data**

In [24]:
# Re-check shape as (rows, columns) after the missing-value check
df.shape

(22950, 171)

##### **- Scaling**

In [25]:
# Create scaler object using StandardScaler from sklearn.preprocessing.
# StandardScaler standardises each column independently: it subtracts the column
# mean and divides by the column standard deviation, so every scaled column ends
# up with mean 0 and standard deviation 1. It does not require the data to be
# normally distributed.
scaler = StandardScaler()

In [26]:
# Show the frame's dimensions, then the full list of column names, one per line of output
print(df.shape, df.columns.tolist(), sep='\n')

(22950, 171)
['id', 'DATE', 'MONTH', 'BASEL_cloud_cover', 'BASEL_wind_speed', 'BASEL_humidity', 'BASEL_pressure', 'BASEL_global_radiation', 'BASEL_precipitation', 'BASEL_snow_depth', 'BASEL_sunshine', 'BASEL_temp_mean', 'BASEL_temp_min', 'BASEL_temp_max', 'BELGRADE_cloud_cover', 'BELGRADE_humidity', 'BELGRADE_pressure', 'BELGRADE_global_radiation', 'BELGRADE_precipitation', 'BELGRADE_sunshine', 'BELGRADE_temp_mean', 'BELGRADE_temp_min', 'BELGRADE_temp_max', 'BUDAPEST_cloud_cover', 'BUDAPEST_humidity', 'BUDAPEST_pressure', 'BUDAPEST_global_radiation', 'BUDAPEST_precipitation', 'BUDAPEST_sunshine', 'BUDAPEST_temp_mean', 'BUDAPEST_temp_min', 'BUDAPEST_temp_max', 'DEBILT_cloud_cover', 'DEBILT_wind_speed', 'DEBILT_humidity', 'DEBILT_pressure', 'DEBILT_global_radiation', 'DEBILT_precipitation', 'DEBILT_sunshine', 'DEBILT_temp_mean', 'DEBILT_temp_min', 'DEBILT_temp_max', 'DUSSELDORF_cloud_cover', 'DUSSELDORF_wind_speed', 'DUSSELDORF_humidity', 'DUSSELDORF_pressure', 'DUSSELDORF_global_radiati

In [27]:
# Identifier and calendar columns that must keep their raw values (not scaled)
unscaled_columns = [
    'id',
    'DATE',
    'MONTH',
]

In [28]:
# Every remaining column is a feature to scale; the boolean mask keeps the original column order
columns_to_scale = df.columns[~df.columns.isin(unscaled_columns)].tolist()

In [29]:
# Fit the scaler on the feature columns and standardise them in one pass
scaled_data = scaler.fit_transform(X=df[columns_to_scale])

# Wrap the resulting array in a DataFrame that carries the feature names
df_scaled_numeric = pd.DataFrame(data=scaled_data, columns=columns_to_scale)

In [30]:
# Place the untouched identifier columns next to the standardised features.
# Both pieces get a fresh 0..n-1 index first so rows line up by position.
df_scaled = pd.concat(
    [part.reset_index(drop=True) for part in (df[unscaled_columns], df_scaled_numeric)],
    axis=1,
)

In [31]:
# Put the columns in order: unscaled identifier columns first, then the scaled features
df_scaled = df_scaled.loc[:, [*unscaled_columns, *columns_to_scale]]

In [32]:
# Preview the original (unscaled) data, for comparison with the df_scaled summary printed below
df.head()

Unnamed: 0,id,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [33]:
# Check mean and standard deviation of df_scaled: scaled columns should show mean ~0 and std ~1.
# The std prints as 1.000022 rather than exactly 1 because describe() reports the sample
# standard deviation (ddof=1), while StandardScaler divides by the population one (ddof=0).
print(df_scaled.describe())

                 id          DATE         MONTH  BASEL_cloud_cover  \
count  22950.000000  2.295000e+04  22950.000000       2.295000e+04   
mean   11474.500000  1.990984e+07      6.509630      -1.975278e-16   
std     6625.238675  1.813833e+05      3.443672       1.000022e+00   
min        0.000000  1.960010e+07      1.000000      -2.248803e+00   
25%     5737.250000  1.975092e+07      4.000000      -5.863365e-01   
50%    11474.500000  1.991060e+07      7.000000       2.448969e-01   
75%    17211.750000  2.007021e+07      9.000000       6.605137e-01   
max    22949.000000  2.022103e+07     12.000000       1.076130e+00   

       BASEL_wind_speed  BASEL_humidity  BASEL_pressure  \
count      2.295000e+04    2.295000e+04    2.295000e+04   
mean      -8.817541e-16    6.935145e-16   -1.690442e-15   
std        1.000022e+00    1.000022e+00    1.000022e+00   
min       -2.894398e+00   -3.690749e+00   -6.619945e+00   
25%       -2.793014e-02   -7.096309e-01   -3.381979e-01   
50%       -2.79

In [34]:
# Check mean and standard deviation of the original (unscaled) data for comparison
print(df.describe())

                 id          DATE         MONTH  BASEL_cloud_cover  \
count  22950.000000  2.295000e+04  22950.000000       22950.000000   
mean   11474.500000  1.990984e+07      6.509630           5.410763   
std     6625.238675  1.813833e+05      3.443672           2.406115   
min        0.000000  1.960010e+07      1.000000           0.000000   
25%     5737.250000  1.975092e+07      4.000000           4.000000   
50%    11474.500000  1.991060e+07      7.000000           6.000000   
75%    17211.750000  2.007021e+07      9.000000           7.000000   
max    22949.000000  2.022103e+07     12.000000           8.000000   

       BASEL_wind_speed  BASEL_humidity  BASEL_pressure  \
count      22950.000000    22950.000000    22950.000000   
mean           2.120462        0.758554        1.018013   
std            0.732625        0.110699        0.006543   
min            0.000000        0.350000        0.974700   
25%            2.100000        0.680000        1.015800   
50%            

#### **04. Export as .csv**

In [36]:
# Export the scaled data to the '1.2' subfolder, leaving out the pandas index.
# The output folder is created first so the export also works when '1.2' does
# not exist yet (to_csv does not create missing directories).
output_dir = os.path.join(path, '1.2')
os.makedirs(output_dir, exist_ok=True)
df_scaled.to_csv(os.path.join(output_dir, '[DATASET]_scaled.csv'), index=False)