In [1]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt

# Execute the code below

In [2]:
import pandas as pd
import numpy as np
link_main = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather_main_2018.csv"
link_opinion = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather_opinion_2018.csv"
df_main = pd.read_csv(link_main)
df_opinion = pd.read_csv(link_opinion)
print(df_main.head())
print(df_opinion.head())

         DATE  MAX_TEMPERATURE_C  ...  DEWPOINT_MAX_C  WINDTEMP_MAX_C
0  2018-01-01                 12  ...               8               7
1  2018-01-02                 13  ...              12               6
2  2018-01-03                 15  ...              13               7
3  2018-01-04                 14  ...              12              10
4  2018-01-05                 12  ...              10               7

[5 rows x 15 columns]
         date  WEATHER_CODE_EVENING  TOTAL_SNOW_MM  UV_INDEX  SUNHOUR OPINION
0  2018-01-01                   113              0         3      5.1     bad
1  2018-03-12                   119              0         2      8.8     bad
2  2018-03-09                   116              0         3     10.2     bad
3  2018-10-07                   122              0         1      5.6     bad
4  2018-06-18                   119              0         1     12.9     bad


# Classification challenge

Your goal are :
- to merge both 2018 DataFrames
- to train-test split the new 2018 DataFrame
- to train 3 differents Machine Learning algorithms (KNN, logistic regression and decision tree) with "opinion" as target
- to try different parameters
- to find the best accuracy score (on the test set of course)
- to fill the missing values in the "opinion" columns whith your best model
- to explain what is the "rules" used by your model to predict the opinion.

You can help yourself with charts if you want.

In [80]:
# merge both 2018 DataFrames
df2018 = pd.merge(df_main ,
                  df_opinion, 
                  how="left", 
                  left_on='DATE',
                  right_on='date')
df2018.drop('date',             
            axis = 1,           
            inplace = True) 
df2018

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,PRESSURE_MAX_MB,CLOUDCOVER_AVG_PERCENT,HEATINDEX_MAX_C,DEWPOINT_MAX_C,WINDTEMP_MAX_C,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION
0,2018-01-01,12,8,61,9,11,8,8.9,79,9.500,1018,41.750,12,8,7,113.0,0.0,3.0,5.1,bad
1,2018-01-02,13,6,26,8,12,13,0.6,96,9.000,1020,87.875,13,12,6,122.0,0.0,3.0,3.3,bad
2,2018-01-03,15,10,40,11,12,10,5.5,82,8.500,1017,91.500,15,13,7,122.0,0.0,3.0,3.3,bad
3,2018-01-04,14,11,45,14,14,11,0.0,89,10.000,1011,90.125,14,12,10,116.0,0.0,3.0,3.3,bad
4,2018-01-05,12,7,21,10,11,8,1.5,85,9.875,1005,62.375,12,10,7,116.0,0.0,3.0,6.9,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2018-12-27,7,2,6,2,6,5,0.0,85,10.000,1027,30.750,8,6,3,119.0,0.0,1.0,8.7,very bad
361,2018-12-28,7,2,8,2,7,3,0.0,89,8.000,1035,18.750,8,4,4,113.0,0.0,1.0,8.7,very bad
362,2018-12-29,7,1,6,1,6,4,0.0,94,7.000,1038,33.000,8,5,1,116.0,0.0,1.0,8.7,very bad
363,2018-12-30,9,4,6,5,9,8,0.1,95,6.000,1038,70.375,10,9,7,143.0,0.0,1.0,3.3,very bad


In [11]:
df2018.isnull().sum()

DATE                       0
MAX_TEMPERATURE_C          0
MIN_TEMPERATURE_C          0
WINDSPEED_MAX_KMH          0
TEMPERATURE_MORNING_C      0
TEMPERATURE_NOON_C         0
TEMPERATURE_EVENING_C      0
PRECIP_TOTAL_DAY_MM        0
HUMIDITY_MAX_PERCENT       0
VISIBILITY_AVG_KM          0
PRESSURE_MAX_MB            0
CLOUDCOVER_AVG_PERCENT     0
HEATINDEX_MAX_C            0
DEWPOINT_MAX_C             0
WINDTEMP_MAX_C             0
WEATHER_CODE_EVENING      24
TOTAL_SNOW_MM             24
UV_INDEX                  24
SUNHOUR                   24
OPINION                   24
dtype: int64

In [86]:
df2018_clean = df2018.dropna()
df2018_nan = df2018[df2018['OPINION'].isna()]

In [87]:
# train-test split the new 2018 DataFrame
X = df2018_clean.iloc[:, 1:-5]
y = df2018_clean['OPINION']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = 0.75)

KNN

In [88]:
#entrainement du modèle KNN avec les paramètres par défaut
modelKNN = KNeighborsClassifier().fit(X_train, y_train)
print(f"score Train = {modelKNN.score(X_train, y_train)}, score Test = {modelKNN.score(X_test, y_test)}")

score Train = 0.8, score Test = 0.7790697674418605


In [89]:
#Test avec différentes valeurs pour les paramètres n_neighbors et weights
for i in range(2,11):
  modelTest = KNeighborsClassifier(n_neighbors = i, weights = 'distance').fit(X_train, y_train)
  print(f"n_neighbors = {i}, weights = 'distance', score Train = {modelTest.score(X_train, y_train)}, score Test = {modelTest.score(X_test, y_test)}")
  modelTest = KNeighborsClassifier(n_neighbors = i, weights = 'uniform').fit(X_train, y_train)
  print(f"n_neighbors = {i}, weights = 'uniform', score Train = {modelTest.score(X_train, y_train)}, score Test = {modelTest.score(X_test, y_test)}")

n_neighbors = 2, weights = 'distance', score Train = 1.0, score Test = 0.7674418604651163
n_neighbors = 2, weights = 'uniform', score Train = 0.8470588235294118, score Test = 0.7325581395348837
n_neighbors = 3, weights = 'distance', score Train = 1.0, score Test = 0.8023255813953488
n_neighbors = 3, weights = 'uniform', score Train = 0.8509803921568627, score Test = 0.813953488372093
n_neighbors = 4, weights = 'distance', score Train = 1.0, score Test = 0.813953488372093
n_neighbors = 4, weights = 'uniform', score Train = 0.8117647058823529, score Test = 0.7790697674418605
n_neighbors = 5, weights = 'distance', score Train = 1.0, score Test = 0.7558139534883721
n_neighbors = 5, weights = 'uniform', score Train = 0.8, score Test = 0.7790697674418605
n_neighbors = 6, weights = 'distance', score Train = 1.0, score Test = 0.7906976744186046
n_neighbors = 6, weights = 'uniform', score Train = 0.792156862745098, score Test = 0.7325581395348837
n_neighbors = 7, weights = 'distance', score Tra

In [None]:
#Les combinaisons de paramètres n_neighbors = 3, weights = 'uniform' et n_neighbors = 4, weights = 'distance'
#obtiennent le même meilleur résultat sur le jeu de test
#cependant il y a moins d'overfitting avec la 17re combinaison, on gardera donc celle là :

#######    n_neighbors = 3, weights = 'uniform', score Train = 0.8509803921568627, score Test = 0.813953488372093   #######

In [51]:
#La confusion matrix semble plutôt bonne, la plupart des résultats sont bien prédits (diagonale)
#Lorsqu'ils ne sont pas bien prédits, les résultats sont quand même dans une opinion "proche" : pas de very good classés en very bad par exemple

model_choisi_1 = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform').fit(X_train, y_train)

pd.DataFrame(data = confusion_matrix(y_true = y_test, y_pred = model_choisi_1.predict(X_test)),
             index = model_choisi_1.classes_ + " actual",
             columns = model_choisi_1.classes_ + " predicted")

Unnamed: 0,bad predicted,good predicted,not good not bad predicted,very bad predicted,very good predicted
bad actual,15,0,1,1,0
good actual,0,17,1,0,3
not good not bad actual,3,3,13,0,0
very bad actual,1,1,0,22,0
very good actual,0,2,0,0,3


Logistic Regression

In [52]:
#entrainement du modèle LR avec les paramètres par défaut
modelLR = LogisticRegression().fit(X_train,y_train)
print(f"score Train = {modelLR.score(X_train, y_train)}, score Test = {modelLR.score(X_test, y_test)}")

score Train = 0.807843137254902, score Test = 0.7558139534883721


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [53]:
modelLR.n_iter_

array([100], dtype=int32)

In [56]:
#Augmentation du nombre d'iterations, comme conseillé dans le message d'erreur ci-dessus
modelLR_iter = LogisticRegression(max_iter=5000).fit(X_train,y_train)
print(f"score Train = {modelLR.score(X_train, y_train)}, score Test = {modelLR.score(X_test, y_test)}")

score Train = 0.807843137254902, score Test = 0.7558139534883721


In [57]:
modelLR_iter.n_iter_

array([3740], dtype=int32)

In [59]:
#Essai avec différentes valeurs pour le paramètre solver :
for k in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] :
  modelLR_solver = LogisticRegression(max_iter=5000, solver = k).fit(X_train,y_train)
  print(f"solver = {k}, score Train = {modelLR_solver.score(X_train, y_train)}, score Test = {modelLR_solver.score(X_test, y_test)}")

solver = newton-cg, score Train = 0.8980392156862745, score Test = 0.8023255813953488
solver = lbfgs, score Train = 0.8941176470588236, score Test = 0.7906976744186046
solver = liblinear, score Train = 0.8235294117647058, score Test = 0.7674418604651163
solver = sag, score Train = 0.8352941176470589, score Test = 0.7790697674418605
solver = saga, score Train = 0.8352941176470589, score Test = 0.7790697674418605




In [None]:
#on obtient le meilleur résultat avec le paramètre 'newton-cg', malgré un peu d'overfitting, c'est donc celui là que l'on va garder :
#######    max_iter = 5000, solver = newton-cg, score Train = 0.8980392156862745, score Test = 0.8023255813953488     #######

In [60]:
model_choisi_2 = LogisticRegression(max_iter=5000, solver = 'newton-cg').fit(X_train,y_train)

pd.DataFrame(data = confusion_matrix(y_true = y_test, y_pred = model_choisi_2.predict(X_test)),
             index = model_choisi_2.classes_ + " actual",
             columns = model_choisi_2.classes_ + " predicted")

Unnamed: 0,bad predicted,good predicted,not good not bad predicted,very bad predicted,very good predicted
bad actual,15,0,2,0,0
good actual,0,15,3,0,3
not good not bad actual,5,3,11,0,0
very bad actual,0,0,0,24,0
very good actual,0,1,0,0,4


In [None]:
# Ceux qui sont le moins bien préduit sont 'not good not bad', pas étonnant car c'est une valeur intermédiaire dont la définition même est un peu 'entre deux'

In [None]:
#Essayons de donner des poids différents à cette catégorie pour voir si le résultat est meilleur

In [61]:
for k in range(2,12) :
  modelLR_weights = LogisticRegression(max_iter= 5000, solver = 'newton-cg', class_weight = {'not good not bad' : k}).fit(X_train,y_train)
  print(f"weights = {k}, score Train = {modelLR_weights.score(X_train, y_train)}, score Test = {modelLR_weights.score(X_test, y_test)}")

weights = 2, score Train = 0.8901960784313725, score Test = 0.7674418604651163




weights = 3, score Train = 0.8862745098039215, score Test = 0.7790697674418605




weights = 4, score Train = 0.8784313725490196, score Test = 0.7674418604651163
weights = 5, score Train = 0.8588235294117647, score Test = 0.7441860465116279
weights = 6, score Train = 0.8627450980392157, score Test = 0.7441860465116279
weights = 7, score Train = 0.8627450980392157, score Test = 0.7441860465116279
weights = 8, score Train = 0.8627450980392157, score Test = 0.7441860465116279
weights = 9, score Train = 0.8509803921568627, score Test = 0.7325581395348837
weights = 10, score Train = 0.8470588235294118, score Test = 0.7441860465116279
weights = 11, score Train = 0.8431372549019608, score Test = 0.7441860465116279


In [None]:
#Les résultats ne sont pas très concluants, on reste donc sur le modèle choisi précédemment

Decision Tree

In [62]:
#entrainement du modèle LR avec les paramètres par défaut
modelDTC = DecisionTreeClassifier().fit(X_train, y_train)
print(f"score Train = {modelDTC.score(X_train, y_train)}, score Test = {modelDTC.score(X_test, y_test)}")

score Train = 1.0, score Test = 0.9069767441860465


In [63]:
#Si on ne met pas de limite max_depth on tombe très vite dans de l'overfitting, ceci dit le résultat du jeu de test est aussi très bon.
#Essayons avec plusieurs valeurs de max_depth
for k in range(1,12) :
  modelDTC_depth = DecisionTreeClassifier(max_depth = k).fit(X_train, y_train)
  print(f"max_depth = {k}, score Train = {modelDTC_depth.score(X_train, y_train)}, score Test = {modelDTC_depth.score(X_test, y_test)}")

max_depth = 1, score Train = 0.5019607843137255, score Test = 0.5232558139534884
max_depth = 2, score Train = 0.7333333333333333, score Test = 0.686046511627907
max_depth = 3, score Train = 0.8274509803921568, score Test = 0.8372093023255814
max_depth = 4, score Train = 0.8588235294117647, score Test = 0.8372093023255814
max_depth = 5, score Train = 0.9019607843137255, score Test = 0.8953488372093024
max_depth = 6, score Train = 0.9450980392156862, score Test = 0.9069767441860465
max_depth = 7, score Train = 0.9647058823529412, score Test = 0.9069767441860465
max_depth = 8, score Train = 0.984313725490196, score Test = 0.8837209302325582
max_depth = 9, score Train = 0.9882352941176471, score Test = 0.9302325581395349
max_depth = 10, score Train = 0.9882352941176471, score Test = 0.9069767441860465
max_depth = 11, score Train = 0.996078431372549, score Test = 0.9069767441860465


In [None]:
#max_depth = 5 me semble être un bon compromis entre overfitting et qualité du modèle. 
#Et on obtient déjà de meilleurs résultats qu'avec les autres algos

########    max_depth = 5, score Train = 0.9019607843137255, score Test = 0.8953488372093024   ##########

In [64]:
#Jettons un oeil à la confusion matrix : les résultats sont très bons !
model_choisi_3 = DecisionTreeClassifier(max_depth = 5).fit(X_train, y_train)

pd.DataFrame(data = confusion_matrix(y_true = y_test, y_pred = model_choisi_3.predict(X_test)),
             index = model_choisi_3.classes_ + " actual",
             columns = model_choisi_3.classes_ + " predicted")

Unnamed: 0,bad predicted,good predicted,not good not bad predicted,very bad predicted,very good predicted
bad actual,17,0,0,0,0
good actual,0,18,0,0,3
not good not bad actual,1,6,12,0,0
very bad actual,0,0,0,24,0
very good actual,0,0,0,0,5


Fill the opinion column 

In [91]:
df2018_nan['OPINION'] = model_choisi_3.predict(df2018_nan.iloc[:, 1:-5])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [92]:
df2018_nan

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,PRESSURE_MAX_MB,CLOUDCOVER_AVG_PERCENT,HEATINDEX_MAX_C,DEWPOINT_MAX_C,WINDTEMP_MAX_C,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION
31,2018-02-01,8,3,17,4,8,4,0.2,81,10.0,1012,39.5,8,3,2,,,,,very bad
66,2018-03-08,13,7,23,7,12,9,0.4,87,9.75,1005,52.625,13,7,6,,,,,bad
100,2018-04-11,15,7,14,7,15,12,1.2,87,7.875,1002,59.125,15,10,10,,,,,not good not bad
118,2018-04-29,10,7,23,8,10,8,0.7,93,9.0,1008,85.0,10,9,7,,,,,bad
143,2018-05-24,23,12,8,14,22,21,1.4,81,7.875,1018,27.25,25,17,19,,,,,good
152,2018-06-02,23,12,5,15,23,21,1.0,75,9.875,1021,24.75,25,18,20,,,,,good
163,2018-06-13,20,12,10,13,20,19,0.3,72,9.0,1020,36.875,21,16,18,,,,,good
165,2018-06-15,19,13,16,14,19,18,0.1,85,8.0,1019,61.625,19,15,16,,,,,good
175,2018-06-25,25,13,22,15,24,24,0.0,71,10.0,1023,4.75,26,19,20,,,,,good
184,2018-07-04,25,16,15,18,24,21,0.0,74,7.75,1017,41.125,26,17,22,,,,,good


Explain what is the "rules" used by your model to predict the opinion.

In [None]:
# L'arbre de décision segmente le jeu de données en répondant à des questions Vrai/Faux qui permettent de classer d'un côté les Vrai de l'autre côté 
# les Faux puis de poser une autre 'question' et ainsi de suite jusqu'à obtenir une classification très proche de la réalité.

# Pour un modèle plus généralisable on réduit le nombre de 'question' à 5 et on prend donc la classification obtenu après ces 5 segmentations.
