In [49]:
! pip install scikit-learn-extra
! pip install kmodes
! pip install plotly
! pip install graphviz


Collecting plotly
  Downloading plotly-5.13.0-py2.py3-none-any.whl (15.2 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.1.0-py3-none-any.whl (23 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.13.0 tenacity-8.1.0
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.20.1


In [51]:


import pandas as pd # for data manipulation
import numpy as np # for data manipulation

from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn import tree # for decision tree models

import plotly.express as px  # for data visualization
import plotly.graph_objects as go # for data visualization
import graphviz # for plotting decision tree graphs
from sklearn.tree import DecisionTreeRegressor

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 500)


In [114]:
def fitting(X, y, criterion, splitter, mdepth, clweight, minleaf):

    # Create training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

    # Fit the model
    model = tree.DecisionTreeClassifier(criterion=criterion, 
                                        splitter=splitter, 
                                        max_depth=mdepth,
                                        class_weight=clweight,
                                        min_samples_leaf=minleaf, 
                                        random_state=0, 
                                  )
    clf = model.fit(X_train, y_train)

    # Predict class labels on training data
    pred_labels_tr = model.predict(X_train)
    # Predict class labels on a test data
    pred_labels_te = model.predict(X_test)

    # Tree summary and model evaluation metrics
    print('*************** Tree Summary ***************')
    print('Classes: ', clf.classes_)
    print('Tree Depth: ', clf.tree_.max_depth)
    print('No. of leaves: ', clf.tree_.n_leaves)
    print('No. of features: ', clf.n_features_in_)
    print('--------------------------------------------------------')
    print("")
    
    print('*************** Evaluation on Test Data ***************')
    score_te = model.score(X_test, y_test)
    print('Accuracy Score: ', score_te)
    # Look at classification report to evaluate the model
    print(classification_report(y_test, pred_labels_te))
    print('--------------------------------------------------------')
    print("")
    
    print('*************** Evaluation on Training Data ***************')
    score_tr = model.score(X_train, y_train)
    print('Accuracy Score: ', score_tr)
    # Look at classification report to evaluate the model
    print(classification_report(y_train, pred_labels_tr))
    print('--------------------------------------------------------')
    
  
    
    # Return relevant data for chart plotting
    return X_train, X_test, y_train, y_test, clf

In [81]:

data = pd.read_csv("DataByMonthsA.csv", low_memory = False)
data


Unnamed: 0,member_noSource,Month,Total
0,8996,3,4424.2
1,9587,3,20645.9
2,7626,3,4415.4
3,4409,3,6281.0
4,7315,3,9786.7
...,...,...,...
55434,6675,10,124464.0
55435,9778,10,409872.0
55436,6561,10,105712.0
55437,9403,10,101728.0


In [82]:

members_reg_gen = pd.read_csv("members-regions-gender.csv",low_memory=False,sep=';')

df_new = data.pivot(index='member_noSource' ,columns='Month', values='Total').reset_index()
df_new.fillna(0, inplace=True)

df_new

Month,member_noSource,3,4,5,6,7,8,9,10
0,63,0.0,0.0,0.0,0.0,564.3,0.0,0.0,8048.0
1,80,0.0,0.0,0.0,437.8,0.0,0.0,6208.0,0.0
2,83,0.0,0.0,0.0,3653.1,0.0,0.0,52976.0,0.0
3,84,0.0,0.0,1856.8,0.0,0.0,26848.0,0.0,0.0
4,89,0.0,0.0,0.0,1667.6,0.0,0.0,24096.0,0.0
...,...,...,...,...,...,...,...,...,...
9109,9996,13588.3,63930.9,52202.7,241104.9,951931.9,756272.0,643344.0,362864.0
9110,9997,19144.4,92967.6,48318.6,331394.8,1365365.6,699296.0,788128.0,257056.0
9111,9998,29543.8,53007.9,76594.1,491150.3,783980.3,1109616.0,913488.0,236368.0
9112,9999,17318.4,83598.9,59677.2,293421.6,1239378.5,864512.0,615136.0,401680.0


In [83]:

#min_max_scaler = preprocessing.MinMaxScaler()
#x_scaled = min_max_scaler.fit_transform(df_new)
#df = pd.DataFrame(x_scaled)

#df['member_noSource'] = source_mem

df = df_new

df

Month,member_noSource,3,4,5,6,7,8,9,10
0,63,0.0,0.0,0.0,0.0,564.3,0.0,0.0,8048.0
1,80,0.0,0.0,0.0,437.8,0.0,0.0,6208.0,0.0
2,83,0.0,0.0,0.0,3653.1,0.0,0.0,52976.0,0.0
3,84,0.0,0.0,1856.8,0.0,0.0,26848.0,0.0,0.0
4,89,0.0,0.0,0.0,1667.6,0.0,0.0,24096.0,0.0
...,...,...,...,...,...,...,...,...,...
9109,9996,13588.3,63930.9,52202.7,241104.9,951931.9,756272.0,643344.0,362864.0
9110,9997,19144.4,92967.6,48318.6,331394.8,1365365.6,699296.0,788128.0,257056.0
9111,9998,29543.8,53007.9,76594.1,491150.3,783980.3,1109616.0,913488.0,236368.0
9112,9999,17318.4,83598.9,59677.2,293421.6,1239378.5,864512.0,615136.0,401680.0


In [84]:
d = pd.merge(df,members_reg_gen,left_on='member_noSource',right_on='member_noSource',how="inner")

d

Unnamed: 0,member_noSource,3,4,5,6,7,8,9,10,region_noSource,gender
0,63,0.0,0.0,0.0,0.0,564.3,0.0,0.0,8048.0,8,0
1,80,0.0,0.0,0.0,437.8,0.0,0.0,6208.0,0.0,7,1
2,83,0.0,0.0,0.0,3653.1,0.0,0.0,52976.0,0.0,5,0
3,84,0.0,0.0,1856.8,0.0,0.0,26848.0,0.0,0.0,2,0
4,89,0.0,0.0,0.0,1667.6,0.0,0.0,24096.0,0.0,4,0
...,...,...,...,...,...,...,...,...,...,...,...
9109,9996,13588.3,63930.9,52202.7,241104.9,951931.9,756272.0,643344.0,362864.0,8,1
9110,9997,19144.4,92967.6,48318.6,331394.8,1365365.6,699296.0,788128.0,257056.0,3,0
9111,9998,29543.8,53007.9,76594.1,491150.3,783980.3,1109616.0,913488.0,236368.0,3,1
9112,9999,17318.4,83598.9,59677.2,293421.6,1239378.5,864512.0,615136.0,401680.0,2,0


In [111]:
lastData=df.copy().drop('member_noSource',axis=1)
lastData

Month,3,4,5,6,7,8,9,10
0,0.0,0.0,0.0,0.0,564.3,0.0,0.0,8048.0
1,0.0,0.0,0.0,437.8,0.0,0.0,6208.0,0.0
2,0.0,0.0,0.0,3653.1,0.0,0.0,52976.0,0.0
3,0.0,0.0,1856.8,0.0,0.0,26848.0,0.0,0.0
4,0.0,0.0,0.0,1667.6,0.0,0.0,24096.0,0.0
...,...,...,...,...,...,...,...,...
9109,13588.3,63930.9,52202.7,241104.9,951931.9,756272.0,643344.0,362864.0
9110,19144.4,92967.6,48318.6,331394.8,1365365.6,699296.0,788128.0,257056.0
9111,29543.8,53007.9,76594.1,491150.3,783980.3,1109616.0,913488.0,236368.0
9112,17318.4,83598.9,59677.2,293421.6,1239378.5,864512.0,615136.0,401680.0


In [125]:
Y = lastData[10]

Y

0         8048.0
1            0.0
2            0.0
3            0.0
4            0.0
          ...   
9109    362864.0
9110    257056.0
9111    236368.0
9112    401680.0
9113    217296.0
Name: 10, Length: 9114, dtype: float64

In [126]:

X = lastData.drop([10],axis=1)

X

Month,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,564.3,0.0,0.0
1,0.0,0.0,0.0,437.8,0.0,0.0,6208.0
2,0.0,0.0,0.0,3653.1,0.0,0.0,52976.0
3,0.0,0.0,1856.8,0.0,0.0,26848.0,0.0
4,0.0,0.0,0.0,1667.6,0.0,0.0,24096.0
...,...,...,...,...,...,...,...
9109,13588.3,63930.9,52202.7,241104.9,951931.9,756272.0,643344.0
9110,19144.4,92967.6,48318.6,331394.8,1365365.6,699296.0,788128.0
9111,29543.8,53007.9,76594.1,491150.3,783980.3,1109616.0,913488.0
9112,17318.4,83598.9,59677.2,293421.6,1239378.5,864512.0,615136.0


In [129]:

X_train, X_test, y_train, y_test, clf = fitting(X, Y, 'gini', 'best', 
                                                       mdepth=None, 
                                                       clweight=None,
                                                       minleaf=1000)

*************** Tree Summary ***************
Classes:  [0.00000e+00 1.12000e+02 1.44000e+02 ... 5.03600e+05 5.43568e+05
 5.52928e+05]
Tree Depth:  3
No. of leaves:  5
No. of features:  7
--------------------------------------------------------

*************** Evaluation on Test Data ***************
Accuracy Score:  0.42073505211190343
              precision    recall  f1-score   support

         0.0       0.42      1.00      0.59       767
       448.0       0.00      0.00      0.00         1
       528.0       0.00      0.00      0.00         1
       544.0       0.00      0.00      0.00         1
       560.0       0.00      0.00      0.00         1
       720.0       0.00      0.00      0.00         1
       816.0       0.00      0.00      0.00         1
      1088.0       0.00      0.00      0.00         1
      1312.0       0.00      0.00      0.00         1
      1328.0       0.00      0.00      0.00         1
      1360.0       0.00      0.00      0.00         1
      1488.0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
