# TODO
- try combinations
- test already written functions
- try other methods
- remove unused functions, options
- knn imputation seems too slow, unusable
- split (train, test) before everything (imputation, sampling, etc.)

In [1]:
from preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import itertools
import time

In [2]:
def train_test_model(model, X_train, X_test, y_train, y_test, average='weighted', verbose=True):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average=average)
    rec = recall_score(y_test, y_pred, average=average)
    f1 = f1_score(y_test, y_pred, average=average)

    if verbose:
        print(f"Accuracy: {acc}")
        print(f"Precision: {prec}")
        print(f"Recall: {rec}")
        print(f"F1: {f1}")
        print()
    return acc, prec, rec, f1


In [3]:
P = Preprocessor()
df = P.load_data('weatherAUS.csv')

In [4]:
print(P.imputation_types())
print(P.outlier_types())
print(P.encoding_types())
print(P.scaling_types())
print(P.discretize_types())
print(P.sampling_types())


['mice', 'mean', 'median', 'mode', 'mode_', 'drop', 'knn', 'interpolate', 'mode_mean', 'mode_median', 'mode_knn', 'mode_interpolate', 'mode_mode', 'None']
['mean', 'median', 'mode', 'drop', 'None']
['onehot', 'ordinal', 'label', 'None']
['standard', 'minmax', 'robust', 'None']
['equal_width', 'equal_freq', 'None']
['smote', 'random', 'smotenc', 'smoten', 'under', 'sklearn', 'None']


# Steps
- Encode RainToday, RainTomorrow (0, 1)
- Resample (sklearn)
- Imputation (mode_) categorical
- Encoding (label)
- Imputation (mice) numerical
- Outliers (drop)
- Scale (standard) for X_train

In [5]:
combination = {
    'imputation': 'mode_', # categorical: mode
    'imputation2': 'mice', # numeric: mice
    'outlier': 'drop',
    'encoding': 'label',
    'scaling': 'standard',
    'discritization': 'equal_width',
    'sampling': 'sklearn',
}

In [6]:
df_p = df.copy()

In [7]:
# convert df['RainToday'], df['RainTomorrow'] from 'Yes'/'No' to 1/0
df_p['RainToday'] = df_p['RainToday'].map({'Yes': 1, 'No': 0})
df_p['RainTomorrow'] = df_p['RainTomorrow'].map({'Yes': 1, 'No': 0})
df_p

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,0.0,0.0
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,0.0,0.0
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,0.0,0.0
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,0.0,0.0


In [8]:
df_p = P.sample(df_p, combination['sampling'])
df_p

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,2013-03-21,Albury,14.9,20.2,2.6,,,N,67.0,NNE,...,81.0,82.0,1015.7,1009.7,8.0,8.0,17.8,18.6,1.0,1.0
77215,2015-05-14,Portland,8.4,14.1,2.8,0.6,2.9,WSW,41.0,SSW,...,82.0,83.0,1033.9,1034.4,8.0,8.0,12.2,13.1,1.0,1.0
57755,2016-06-04,Ballarat,7.5,13.1,5.2,,,SE,35.0,SE,...,100.0,95.0,1016.8,1012.3,8.0,8.0,10.0,12.4,1.0,1.0
65977,2014-02-19,MelbourneAirport,15.5,25.7,0.0,7.2,9.3,W,67.0,NW,...,71.0,32.0,999.2,996.9,4.0,6.0,17.9,23.8,0.0,1.0


In [9]:
df_p = P.impute(df_p, combination['imputation'])
df_p

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,2013-03-21,Albury,14.9,20.2,2.6,,,N,67.0,NNE,...,81.0,82.0,1015.7,1009.7,8.0,8.0,17.8,18.6,1.0,1.0
77215,2015-05-14,Portland,8.4,14.1,2.8,0.6,2.9,WSW,41.0,SSW,...,82.0,83.0,1033.9,1034.4,8.0,8.0,12.2,13.1,1.0,1.0
57755,2016-06-04,Ballarat,7.5,13.1,5.2,,,SE,35.0,SE,...,100.0,95.0,1016.8,1012.3,8.0,8.0,10.0,12.4,1.0,1.0
65977,2014-02-19,MelbourneAirport,15.5,25.7,0.0,7.2,9.3,W,67.0,NW,...,71.0,32.0,999.2,996.9,4.0,6.0,17.9,23.8,0.0,1.0


In [10]:
df_p = P.encode(df_p, combination['encoding'])
df_p

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,396,2,13.4,22.9,0.6,,,13,44.0,13,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0
1,397,2,7.4,25.1,0.0,,,14,44.0,6,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0
2,398,2,12.9,25.7,0.0,,,15,46.0,13,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0
3,399,2,9.2,28.0,0.0,,,4,24.0,9,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0
4,400,2,17.5,32.3,1.0,,,13,41.0,1,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,1878,2,14.9,20.2,2.6,,,3,67.0,5,...,81.0,82.0,1015.7,1009.7,8.0,8.0,17.8,18.6,1.0,1.0
77215,2662,33,8.4,14.1,2.8,0.6,2.9,15,41.0,11,...,82.0,83.0,1033.9,1034.4,8.0,8.0,12.2,13.1,1.0,1.0
57755,3049,5,7.5,13.1,5.2,,,9,35.0,9,...,100.0,95.0,1016.8,1012.3,8.0,8.0,10.0,12.4,1.0,1.0
65977,2213,19,15.5,25.7,0.0,7.2,9.3,13,67.0,7,...,71.0,32.0,999.2,996.9,4.0,6.0,17.9,23.8,0.0,1.0


In [11]:
df_p = P.impute(df_p, combination['imputation2'])
df_p



Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,396.0,2.0,13.4,22.9,0.6,5.758410,7.407814,13.0,44.0,13.0,...,71.0,22.0,1007.7,1007.1,8.000000,4.847870,16.9,21.8,0.0,0.0
1,397.0,2.0,7.4,25.1,0.0,5.639122,11.496005,14.0,44.0,6.0,...,44.0,25.0,1010.6,1007.8,1.734548,2.618758,17.2,24.3,0.0,0.0
2,398.0,2.0,12.9,25.7,0.0,7.770166,12.339338,15.0,46.0,13.0,...,38.0,30.0,1007.6,1008.7,1.843146,2.000000,21.0,23.2,0.0,0.0
3,399.0,2.0,9.2,28.0,0.0,5.992142,11.766955,4.0,24.0,9.0,...,45.0,16.0,1017.6,1012.8,1.257526,2.118895,18.1,26.5,0.0,0.0
4,400.0,2.0,17.5,32.3,1.0,6.847603,5.683229,13.0,41.0,1.0,...,82.0,33.0,1010.8,1006.0,7.000000,8.000000,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220627,1878.0,2.0,14.9,20.2,2.6,2.399494,1.503404,3.0,67.0,5.0,...,81.0,82.0,1015.7,1009.7,8.000000,8.000000,17.8,18.6,1.0,1.0
220628,2662.0,33.0,8.4,14.1,2.8,0.600000,2.900000,15.0,41.0,11.0,...,82.0,83.0,1033.9,1034.4,8.000000,8.000000,12.2,13.1,1.0,1.0
220629,3049.0,5.0,7.5,13.1,5.2,-0.809179,0.869035,9.0,35.0,9.0,...,100.0,95.0,1016.8,1012.3,8.000000,8.000000,10.0,12.4,1.0,1.0
220630,2213.0,19.0,15.5,25.7,0.0,7.200000,9.300000,13.0,67.0,7.0,...,71.0,32.0,999.2,996.9,4.000000,6.000000,17.9,23.8,0.0,1.0


In [12]:
df_p = P.outlier(df_p, combination['outlier'])
df_p

After dropping outliers:  (170731, 23)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,396.0,2.0,13.4,22.9,0.6,5.758410,7.407814,13.0,44.0,13.0,...,71.0,22.0,1007.700000,1007.100000,8.000000,4.847870,16.9,21.8,0.0,0.0
1,397.0,2.0,7.4,25.1,0.0,5.639122,11.496005,14.0,44.0,6.0,...,44.0,25.0,1010.600000,1007.800000,1.734548,2.618758,17.2,24.3,0.0,0.0
2,398.0,2.0,12.9,25.7,0.0,7.770166,12.339338,15.0,46.0,13.0,...,38.0,30.0,1007.600000,1008.700000,1.843146,2.000000,21.0,23.2,0.0,0.0
3,399.0,2.0,9.2,28.0,0.0,5.992142,11.766955,4.0,24.0,9.0,...,45.0,16.0,1017.600000,1012.800000,1.257526,2.118895,18.1,26.5,0.0,0.0
4,400.0,2.0,17.5,32.3,1.0,6.847603,5.683229,13.0,41.0,1.0,...,82.0,33.0,1010.800000,1006.000000,7.000000,8.000000,17.8,29.7,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220626,2237.0,23.0,8.8,15.7,0.0,1.792403,1.807538,7.0,56.0,7.0,...,99.0,87.0,1016.637213,1013.852659,7.341562,7.432745,9.9,14.4,0.0,1.0
220627,1878.0,2.0,14.9,20.2,2.6,2.399494,1.503404,3.0,67.0,5.0,...,81.0,82.0,1015.700000,1009.700000,8.000000,8.000000,17.8,18.6,1.0,1.0
220629,3049.0,5.0,7.5,13.1,5.2,-0.809179,0.869035,9.0,35.0,9.0,...,100.0,95.0,1016.800000,1012.300000,8.000000,8.000000,10.0,12.4,1.0,1.0
220630,2213.0,19.0,15.5,25.7,0.0,7.200000,9.300000,13.0,67.0,7.0,...,71.0,32.0,999.200000,996.900000,4.000000,6.000000,17.9,23.8,0.0,1.0


In [13]:
X_train, X_test, y_train, y_test = P.split(df_p)

In [14]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

136584
34147
136584
34147


In [15]:
print(type(X_train))
print(type(y_train))
print(type(X_test))
print(type(y_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [16]:
print(type(X_train))
print(type(y_train))
print(type(X_test))
print(type(y_test))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [17]:
X_train.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
215198,1105.0,39.0,22.4,28.8,4.4,7.4,0.8,2.0,39.0,9.0,...,24.0,70.0,72.0,1016.4,1012.8,7.0,7.0,26.3,27.1,1.0
132098,1224.0,6.0,14.8,20.0,1.0,1.897917,1.666222,4.0,28.0,3.0,...,13.0,98.0,94.0,1013.1,1011.9,8.0,8.0,18.1,19.7,0.0
88001,2939.0,29.0,17.8,34.7,0.0,10.390651,12.1,0.0,65.0,2.0,...,31.0,46.0,22.0,1020.1,1015.8,2.173272,2.244744,23.0,33.4,0.0
201383,1270.0,21.0,9.3,15.5,0.2,1.8,5.1,15.0,50.0,15.0,...,30.0,89.0,60.0,1009.4,1007.4,6.0,6.0,11.6,14.7,0.0
53812,1547.0,20.0,22.1,38.3,0.0,9.2,12.0,3.0,44.0,5.0,...,26.0,37.0,14.0,1016.6,1013.1,0.0,1.0,28.3,37.3,0.0


In [18]:
X_test.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
60953,3368.0,44.0,11.6,26.9,0.0,3.4,7.3,1.0,19.0,1.0,...,9.0,100.0,41.0,1028.3,1024.4,7.0,7.0,13.5,26.4,0.0
178017,1346.0,35.0,4.3,12.3,0.2,1.8,1.2,13.0,20.0,13.0,...,11.0,90.0,74.0,1011.3,1009.3,7.0,7.0,8.8,11.4,0.0
135346,855.0,33.0,14.0,19.6,0.0,5.4,7.4,0.0,28.0,1.0,...,7.0,91.0,96.0,1006.4,1005.1,7.0,8.0,17.1,19.3,0.0
26371,1620.0,38.0,10.5,22.0,0.0,1.2,10.2,13.0,33.562793,7.0,...,11.0,52.0,38.0,1018.6,1016.2,1.0,1.0,16.1,21.9,0.0
219339,1365.0,46.0,9.1,17.8,1.8,1.361173,3.699573,13.0,35.0,7.0,...,17.0,99.0,76.0,1023.3,1020.5,6.14555,6.435942,12.7,15.4,1.0


In [19]:
cols = X_train.columns.tolist()
cols.append('RainTomorrow')

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

train = np.concatenate((X_train, y_train.values.reshape(-1,1)), axis=1)
train = pd.DataFrame(train, columns=cols)
# train.to_csv('train.csv', index=False)

test = np.concatenate((X_test, y_test.values.reshape(-1,1)), axis=1)
test = pd.DataFrame(test, columns=cols)
# test.to_csv('test.csv', index=False)

In [20]:
X_train = train.drop('RainTomorrow', axis=1)
y_train = train['RainTomorrow']
X_test = test.drop('RainTomorrow', axis=1)
y_test = test['RainTomorrow']

In [21]:
X_train = P.scale(X_train, combination['scaling'])
X_test = P.scale(X_test, combination['scaling'])

In [22]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=100),
    LogisticRegression(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB()
]

In [23]:
s_t = time.time()
print('Combination:', combination)
for model in models:
    print(model)
    acc, prec, rec, f1 = train_test_model(model, X_train, X_test, y_train, y_test, verbose=True)
    with open('result.log', 'a') as f:
        f.write(f"{model.__class__.__name__}:\n")
        f.write(f"Combination: {combination}\n")
        f.write(f"Accuracy: {acc}\n")
        f.write(f"Precision: {prec}\n")
        f.write(f"Recall: {rec}\n")
        f.write(f"F1: {f1}\n")
        f.write('\n')
    print(f"Time taken: {time.time() - s_t}")
    s_t = time.time()

    print()
with open('result.log', 'a') as f:
    f.write('-'*50 + '\n')

Combination: {'imputation': 'mode_', 'imputation2': 'mice', 'outlier': 'drop', 'encoding': 'label', 'scaling': 'standard', 'discritization': 'equal_width', 'sampling': 'sklearn'}
RandomForestClassifier(max_depth=100)
Accuracy: 0.9622221571441122
Precision: 0.9630318940036199
Recall: 0.9622221571441122
F1: 0.9622913696200323

Time taken: 36.425750494003296

LogisticRegression()
Accuracy: 0.795531086186195
Precision: 0.7951306248814657
Recall: 0.795531086186195
F1: 0.7946634533779091

Time taken: 0.47051072120666504

DecisionTreeClassifier()
Accuracy: 0.9037982838902393
Precision: 0.9060437101741322
Recall: 0.9037982838902393
F1: 0.9040535570260345

Time taken: 2.202206611633301

KNeighborsClassifier()
Accuracy: 0.846340820569889
Precision: 0.8523235260076757
Recall: 0.846340820569889
F1: 0.8468578399719089

Time taken: 7.794291973114014

GaussianNB()
Accuracy: 0.7564061264532755
Precision: 0.7568714983094121
Recall: 0.7564061264532755
F1: 0.7565994414675461

Time taken: 0.09708023071289

# PySpark MLlib

In [24]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,RainTomorrow
0,-0.927162,1.087925,1.627074,0.913995,2.798731,1.001441,-1.678797,-1.294296,-0.020479,0.469543,...,-0.060146,0.799394,-0.179996,-0.355822,0.934941,0.900072,1.528585,0.919505,2.11207,1.0
1,-0.792711,-1.25101,0.420976,-0.399549,0.256309,-0.990904,-1.4343,-0.866204,-0.92581,-0.87193,...,1.578757,1.873681,-0.710004,-0.499693,1.340364,1.330897,0.217506,-0.2072,-0.479827,1.0
2,1.144952,0.379156,0.897067,1.794667,-0.491462,2.084378,1.510693,-1.722389,2.119395,-1.095509,...,-1.46492,-1.642168,0.414256,0.123747,-1.021924,-1.148613,1.000956,1.878726,-0.479827,0.0
3,-0.740739,-0.187858,-0.451858,-1.071248,-0.341908,-1.02636,-0.465097,1.488305,0.884853,1.811015,...,1.051967,0.213419,-1.304256,-1.219046,0.529519,0.469246,-0.821763,-0.968487,-0.479827,1.0
4,-0.427775,-0.258735,1.579464,2.332026,-0.491462,1.653235,1.482468,-1.08025,0.391036,-0.424772,...,-1.99171,-2.032818,-0.147874,-0.307865,-1.903017,-1.684881,1.848361,2.47253,-0.479827,0.0


In [25]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


import findspark
findspark.init()

spark = SparkSession.builder.appName('RF').getOrCreate()

In [26]:
# convert the dataframe to spark dataframe
train_spark = spark.createDataFrame(train)
test_spark = spark.createDataFrame(test)

input_cols = train_spark.columns[:-1]
output_col = train_spark.columns[-1]

featureassemble = VectorAssembler(inputCols=input_cols, outputCol='features')
output = featureassemble.transform(train_spark)
testoutput = featureassemble.transform(test_spark)

train_spark = output.select('features', 'RainTomorrow') 
train_spark.show(n=5)

test_spark = testoutput.select('features', 'RainTomorrow')

+--------------------+------------+
|            features|RainTomorrow|
+--------------------+------------+
|[-0.9271615961677...|         1.0|
|[-0.7927114738425...|         1.0|
|[1.14495205378537...|         0.0|
|[-0.7407391576495...|         1.0|
|[-0.4277754275312...|         0.0|
+--------------------+------------+
only showing top 5 rows



In [27]:
def evaluate(results):
    # Print the accuracy, precision, recall and f1 score
    evaluator = MulticlassClassificationEvaluator(labelCol='RainTomorrow', predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(results)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    evaluator = MulticlassClassificationEvaluator(labelCol='RainTomorrow', predictionCol="prediction", metricName="weightedPrecision")
    weightedPrecision = evaluator.evaluate(results)
    print(f"Weighted Precision: {weightedPrecision * 100:.2f}%")

    evaluator = MulticlassClassificationEvaluator(labelCol='RainTomorrow', predictionCol="prediction", metricName="weightedRecall")
    weightedRecall = evaluator.evaluate(results)
    print(f"Weighted Recall: {weightedRecall * 100:.2f}%")

    evaluator = MulticlassClassificationEvaluator(labelCol='RainTomorrow', predictionCol="prediction", metricName="f1")
    f1 = evaluator.evaluate(results)
    print(f"F1 Score: {f1 * 100:.2f}%")

In [28]:
classifier = RandomForestClassifier(numTrees=100, maxDepth=7, labelCol='RainTomorrow', featuresCol='features')
classifier = classifier.fit(train_spark)

# Evaluate the model on the test data
results = classifier.transform(test_spark)
evaluate(results)

Accuracy: 80.17%
Weighted Precision: 80.12%
Weighted Recall: 80.17%
F1 Score: 80.11%


In [31]:
classifier = LogisticRegression(labelCol=output_col)
classifier = classifier.fit(train_spark)

# Evaluate the model on the test data
results = classifier.evaluate(test_spark) 
evaluate(results.predictions)

Accuracy: 79.55%
Weighted Precision: 79.51%
Weighted Recall: 79.55%
F1 Score: 79.46%


In [35]:
classifier = LinearSVC(labelCol=output_col, featuresCol='features')
classifier = classifier.fit(train_spark)

# Evaluate the model on the test data
results = classifier.transform(test_spark)
evaluate(results)

Accuracy: 79.67%
Weighted Precision: 79.62%
Weighted Recall: 79.67%
F1 Score: 79.59%


# Again but with P.preprocess(), you can just use it.

In [None]:
df2 = P.load_data('weatherAUS.csv')

In [None]:
X_train2, X_test2, y_train2, y_test2 = P.preprocess(df2)

In [None]:
models2 = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    GaussianNB()
]

In [None]:
s_t = time.time()
print('Combination:', combination)
for model in models2:
    print(model)
    acc, prec, rec, f1 = train_test_model(model, X_train2, X_test2, y_train2, y_test2, verbose=True)
    print(f"Time taken: {time.time() - s_t}")
    s_t = time.time()

    print()