# TODO
- try combinations
- test already written functions
- try other methods
- remove unused functions, options
- KNN imputation seems too slow to be usable
- split into (train, test) before every other step (imputation, sampling, etc.) so that no test data leaks into preprocessing

In [67]:
from preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import itertools
import time

In [68]:
def train_test_model(model, X_train, X_test, y_train, y_test, average='weighted', verbose=True):
    """Fit ``model`` on the training split and score its test-split predictions.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels the predictions are compared against.
        average: Averaging mode forwarded to the precision, recall and F1
            scorers.
        verbose: When true, print each metric followed by a blank line.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Insertion order doubles as both the print order and the return order.
    scores = {
        "Accuracy": accuracy_score(y_test, predicted),
        "Precision": precision_score(y_test, predicted, average=average),
        "Recall": recall_score(y_test, predicted, average=average),
        "F1": f1_score(y_test, predicted, average=average),
    }

    if verbose:
        for label, value in scores.items():
            print(f"{label}: {value}")
        print()
    return tuple(scores.values())


In [69]:
# Create the preprocessing helper (presumably provided by the star import from
# `preprocessing`) and load the raw dataset from the CSV file.
P = Preprocessor()
df = P.load_data('weatherAUS.csv')

In [70]:
# Show the option names the preprocessor reports for each pipeline stage,
# one list per line, in pipeline order.
for _list_options in (
    P.imputation_types,
    P.outlier_types,
    P.encoding_types,
    P.scaling_types,
    P.discretize_types,
    P.sampling_types,
):
    print(_list_options())


['mean', 'median', 'mode', 'drop', 'knn', 'interpolate', 'mode_mean', 'mode_median', 'mode_knn', 'mode_interpolate', 'mode_mode', 'None']
['mean', 'median', 'mode', 'drop', 'None']
['onehot', 'ordinal', 'None']
['standard', 'minmax', 'robust', 'None']
['equal_width', 'equal_freq', 'None']
['smote', 'random', 'smotenc', 'smoten', 'under', 'None']


In [71]:
# Option chosen for each preprocessing stage. Values are option names as
# reported by the P.*_types() helpers; note that 'None' is the string, not the
# Python None object.
combination = {
    'imputation': 'mode_mean',
    'outlier': 'median',
    'encoding': 'ordinal',
    'scaling': 'standard',
    # NOTE(review): key is misspelled ("discretization"). It is never read in
    # the cells shown here, but it does appear in the printed/logged dict.
    'discritization': 'None',
    'sampling': 'None',
}

# encode_first = ('_' not in combination['imputation']) # imputation might need all to be encoded first (numeric)
# Hard-coded: encoding runs after imputation and outlier handling.
encode_first = False

In [72]:
# Process the 'Date' column; the preview of the result shows year, month and
# day columns.
df_p = P.handleDate(df, 'Date')

In [73]:
# Preview the first rows after date handling.
df_p.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,year,month,day,RainTomorrow
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,1007.1,8.0,,16.9,21.8,No,2008,12,1,No
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,1007.8,,,17.2,24.3,No,2008,12,2,No
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,1008.7,,2.0,21.0,23.2,No,2008,12,3,No
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,1012.8,,,18.1,26.5,No,2008,12,4,No
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,2008,12,5,No


In [74]:
# Encode before imputation only when encode_first is set.
if encode_first:
    df_p = P.encode(df_p, combination['encoding'])

In [75]:
# Preview the first rows after the optional early-encoding step.
df_p.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,year,month,day,RainTomorrow
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,1007.1,8.0,,16.9,21.8,No,2008,12,1,No
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,1007.8,,,17.2,24.3,No,2008,12,2,No
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,1008.7,,2.0,21.0,23.2,No,2008,12,3,No
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,1012.8,,,18.1,26.5,No,2008,12,4,No
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,2008,12,5,No


In [76]:
# Fill missing values using the imputation option selected in `combination`.
# NOTE(review): this runs on the full dataset, before the train/test split, so
# any statistics the imputer computes presumably include test rows - confirm.
df_p = P.impute(df_p, combination['imputation'])

In [77]:
# Preview the first rows after imputation.
df_p.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,year,month,day,RainTomorrow
0,Albury,13.4,22.9,0.6,5.468232,7.611178,W,44.0,W,WNW,...,1007.1,8.0,4.50993,16.9,21.8,No,2008,12,1,No
1,Albury,7.4,25.1,0.0,5.468232,7.611178,WNW,44.0,NNW,WSW,...,1007.8,4.447461,4.50993,17.2,24.3,No,2008,12,2,No
2,Albury,12.9,25.7,0.0,5.468232,7.611178,WSW,46.0,W,WSW,...,1008.7,4.447461,2.0,21.0,23.2,No,2008,12,3,No
3,Albury,9.2,28.0,0.0,5.468232,7.611178,NE,24.0,SE,E,...,1012.8,4.447461,4.50993,18.1,26.5,No,2008,12,4,No
4,Albury,17.5,32.3,1.0,5.468232,7.611178,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,2008,12,5,No


In [78]:
# Count the distinct values in each column after imputation.
df_p.nunique()

Location          49
MinTemp          390
MaxTemp          506
Rainfall         682
Evaporation      359
Sunshine         146
WindGustDir       16
WindGustSpeed     68
WindDir9am        16
WindDir3pm        16
WindSpeed9am      44
WindSpeed3pm      45
Humidity9am      102
Humidity3pm      102
Pressure9am      547
Pressure3pm      550
Cloud9am          11
Cloud3pm          11
Temp9am          442
Temp3pm          503
RainToday          2
year              11
month             12
day               31
RainTomorrow       2
dtype: int64

In [79]:
# Apply the outlier-handling option selected in `combination`.
# NOTE(review): like imputation, this runs before the train/test split.
df_p = P.outlier(df_p, combination['outlier'])

In [80]:
# Preview the first rows after outlier handling.
df_p.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,year,month,day,RainTomorrow
0,Albury,13.4,22.9,0.6,5.468232,7.611178,W,44.0,W,WNW,...,1007.1,8.0,4.50993,16.9,21.8,No,2008,12,1,No
1,Albury,7.4,25.1,0.0,5.468232,7.611178,WNW,44.0,NNW,WSW,...,1007.8,4.447461,4.50993,17.2,24.3,No,2008,12,2,No
2,Albury,12.9,25.7,0.0,5.468232,7.611178,WSW,46.0,W,WSW,...,1008.7,4.447461,2.0,21.0,23.2,No,2008,12,3,No
3,Albury,9.2,28.0,0.0,5.468232,7.611178,NE,24.0,SE,E,...,1012.8,4.447461,4.50993,18.1,26.5,No,2008,12,4,No
4,Albury,17.5,32.3,1.0,5.468232,7.611178,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,2008,12,5,No


In [81]:
# Encode now unless encoding was already done before imputation.
if not encode_first:
    df_p = P.encode(df_p, combination['encoding'])

In [82]:
# Preview the first rows after encoding.
df_p.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,year,month,day,RainTomorrow
0,2,194,201,6,37,17,13,20,13,14,...,84,9,4,183,194,0,1,11,0,0
1,2,133,224,0,37,17,14,20,6,15,...,91,5,4,187,219,0,1,11,1,0
2,2,189,230,0,37,17,15,21,13,15,...,100,5,1,225,208,0,1,11,2,0
3,2,151,253,0,37,17,4,8,9,0,...,142,5,4,196,241,0,1,11,3,0
4,2,235,296,10,37,17,13,18,1,7,...,73,8,8,193,273,0,1,11,4,0


In [83]:
# Split the preprocessed frame into train/test features and labels. No split
# options are passed, so whatever defaults P.split defines apply.
X_train, X_test, y_train, y_test = P.split(df_p)

In [84]:
# Sanity check: True would mean every training label equals 0.
(y_train == 0).all()

False

In [85]:
# Apply the sampling option selected in `combination` to the training split
# only; the test split is left untouched.
X_train_res, y_train_res = P.sampleXy(X_train, y_train, sampling_strategy=combination['sampling'])

In [86]:
# Scale the training features, then reuse P.scaler for the test features.
# NOTE(review): combination['scaling'] is not passed here, so P.scaleX uses
# its own default; presumably it also sets P.scaler (None when scaling is
# disabled) - confirm against the Preprocessor implementation.
# NOTE(review): re-running this cell would transform X_test a second time.
X_train_res = P.scaleX(X_train_res)
if P.scaler is not None:
    X_test = P.scaler.transform(X_test) # scale test set with train set scaler

In [87]:
# Candidate classifiers, each constructed with its default arguments.
# SVC is imported at the top of the file but not included in this list.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    GaussianNB()
]

In [88]:
# Train and evaluate every candidate model on the same preprocessed split,
# printing the metrics and appending them to result.log.
s_t = time.time()
print('Combination:', combination)
for model in models:
    print(model)
    model_name = model.__class__.__name__

    # Always start from the shared preprocessed data. These names used to be
    # assigned only in the fall-through branch, so KNeighborsClassifier and
    # GaussianNB reused whatever an earlier iteration had left behind and
    # raised NameError if either came first in `models`.
    X_train_res2 = X_train_res
    X_test2 = X_test
    # TODO: model-specific preprocessing (extra scaling for KNN,
    # discretization for GaussianNB) is not applied yet.

    acc, prec, rec, f1 = train_test_model(model, X_train_res2, X_test2, y_train_res, y_test, verbose=True)

    # One write per model keeps each log record contiguous.
    with open('result.log', 'a') as f:
        f.write(
            f"{model_name}:\n"
            f"Combination: {combination}\n"
            f"Accuracy: {acc}\n"
            f"Precision: {prec}\n"
            f"Recall: {rec}\n"
            f"F1: {f1}\n"
            "\n"
        )

    # Elapsed time covers fit, predict, scoring and the log write.
    print(f"Time taken: {time.time() - s_t}")
    s_t = time.time()

    print()

# Separator between runs so results of different combinations are easy to tell apart.
with open('result.log', 'a') as f:
    f.write('-'*50 + '\n')

Combination: {'imputation': 'mode_mean', 'outlier': 'median', 'encoding': 'ordinal', 'scaling': 'standard', 'discritization': 'None', 'sampling': 'None'}
LogisticRegression()
Accuracy: 0.8366561253952977
Precision: 0.8243226379585802
Recall: 0.8366561253952977
F1: 0.8232110228209084

Time taken: 0.2611706256866455

DecisionTreeClassifier()
Accuracy: 0.7800082496906366
Precision: 0.7820872746124531
Recall: 0.7800082496906366
F1: 0.7810198092026849

Time taken: 1.4410078525543213

RandomForestClassifier()
Accuracy: 0.849958751546817
Precision: 0.840950505336467
Recall: 0.849958751546817
F1: 0.8364175944663433

Time taken: 18.4835786819458

KNeighborsClassifier()
Accuracy: 0.828097071359824
Precision: 0.8143443745525578
Recall: 0.828097071359824
F1: 0.8154616297343023

Time taken: 3.3687620162963867

GaussianNB()
Accuracy: 0.7847174480956964
Precision: 0.8079153197472758
Recall: 0.7847174480956964
F1: 0.7933583739995339

Time taken: 0.06643533706665039

