In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import numpy as np
import time
import datetime as datetime

In [2]:
#Inladen van de datasets out & aanmeldingen.

out = pd.read_csv('../data/BenCom/src/out.csv', encoding='latin-1', sep=';')
aanmeldingen = pd.read_csv('../data/BenCom/src/data/aanmeldingen.csv', encoding='latin-1', sep=';')

In [3]:
#isolate results
churn = aanmeldingen['TypeAanmelding']
aanmeldingen['churn'] = np.where(churn == 'overstap',1,0)

In [4]:
#drop irrelevant features
drop = ['TypeAanmelding', 'ProviderNaam', 'Woonplaats']
aanmeldingen = aanmeldingen.drop(drop, axis=1)

In [5]:
#convert type 'TypePakket' and 'TypePakket' to ordinal numbers
aanmeldingen = pd.get_dummies(aanmeldingen, columns=['TypePakket', 'GroenGrijs'])

In [6]:
#convert 'Aanmelddatum' to timeseries. 
aanmeldingen['Aanmelddatum'] = aanmeldingen['Aanmelddatum'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d %H:%M:%S.%f'))
aanmeldingen=aanmeldingen.rename(columns = {'Aanmelddatum':'Datum'})
aanmeldingen['Datum'] = aanmeldingen['Datum'].apply(lambda x: x.date())

In [7]:
#Delete to 4 broken date rows
out = out.reindex(columns= ["Datum", "Medium-partij", "Medium-cat", "Locatie", "Titel/kop", "Timing", "Opmerkingen", "day", "count"])
out = out.drop(out.index[[0,24,30, 60]])

In [8]:
# devide the Medium category in to types online (1) and offline(0)
out['Medium-cat'] = out['Medium-cat'].apply(lambda x: 1 if x == 'Online' else 0)

In [9]:
out['Datum'] = out['Datum'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))
out['Datum'] = out['Datum'].apply(lambda x: x.date())

In [10]:
# !Medium-partij has been dropped. Could be a valuable feature! Discuss with with Wico.
drop = [ 'Medium-partij', 'Titel/kop', 'Opmerkingen', 'Locatie', 'day', 'count']
out = out.drop(drop, axis=1)
out['event'] = int(1)
out = out.dropna(subset = ['Timing'])

## Merge datasets aanmeldingen and out

In [11]:
# convert weekday in numeric values
# rename timing in weekday
result = pd.merge(out, aanmeldingen, on='Datum', how='outer')
result = result.replace({'Timing' : {'maandag' : int(1), 'dinsdag' : int(2), 'woensdag' : int(3), 'woensdag ' : int(3), 'donderdag': int(4), 'vrijdag':int(5), 'zaterdag': int(6), 'zondag': int(7)}})
result= result.rename(columns = {'Timing':'Weekday'})
result = result.fillna(0)
result['Weekday']= pd.to_numeric(result['Weekday'])
result

Unnamed: 0,Datum,Medium-cat,Weekday,event,ProviderId,Leeftijd,churn,TypePakket_combi,TypePakket_electricity,TypePakket_gas,GroenGrijs_grijs,GroenGrijs_groen
0,2014-01-07,0.0,2.0,1.0,27.0,89.0,1.0,1.0,0.0,0.0,1.0,0.0
1,2014-01-07,0.0,2.0,1.0,27.0,66.0,1.0,1.0,0.0,0.0,1.0,0.0
2,2014-01-07,0.0,2.0,1.0,17.0,60.0,1.0,1.0,0.0,0.0,1.0,0.0
3,2014-01-07,0.0,2.0,1.0,60.0,58.0,1.0,1.0,0.0,0.0,1.0,0.0
4,2014-01-07,0.0,2.0,1.0,5.0,29.0,1.0,1.0,0.0,0.0,1.0,0.0
5,2014-01-07,0.0,2.0,1.0,53.0,51.0,1.0,1.0,0.0,0.0,1.0,0.0
6,2014-01-07,0.0,2.0,1.0,5.0,27.0,1.0,1.0,0.0,0.0,1.0,0.0
7,2014-01-07,0.0,2.0,1.0,27.0,31.0,1.0,1.0,0.0,0.0,1.0,0.0
8,2014-01-07,0.0,2.0,1.0,27.0,22.0,0.0,1.0,0.0,0.0,1.0,0.0
9,2014-01-07,0.0,2.0,1.0,27.0,28.0,0.0,1.0,0.0,0.0,1.0,0.0


# Feature selection

In [12]:
# Recursive Feature Elimination
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
#Put data in array
array = result.values
X = array[:,1:11]
Y = array[:,6]
Y=Y.astype('int')
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)



[  6.29354227e-05   1.61630351e-04   2.51955729e-05   4.98419580e-04
   1.32628963e-02   9.82680075e-01   7.37947146e-04   2.29010144e-03
   1.14923433e-04   1.65876197e-04]


In [13]:
# Feature Extraction with PCA
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:20]
Y = array[:,8]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print(fit.explained_variance_ratio_)
print(fit.components_)

URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

In [None]:
result.unique()

In [None]:
drop = ['Medium-partij', 'ProviderNaam', 'Woonplaats']
aanmeldingen = aanmeldingen.drop(drop, axis=1)

In [None]:
drop = ['Medium-partij', 'ProviderNaam', 'Woonplaats']
aanmeldingen = aanmeldingen.drop(drop, axis=1)

In [None]:
drop = ['Medium-partij', 'ProviderNaam', 'Woonplaats']
aanmeldingen = aanmeldingen.drop(drop, axis=1)

In [None]:
result.head(20000)

In [None]:
result["Datum"] = result["Datum"].astype("datetime64")

## Plot your data for interdependencies

In [None]:
result['event'].groupby(result["Datum"].dt.day).count().plot(kind="bar")


## Fit the RandomForest Classifier

In [None]:
#Feuture importance

from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import r2_score
from collections import defaultdict
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import numpy as np

X = result["Leeftijd", "Medium-cat", "Weekday", "event"]
Y = result["churn"]
 
rf = RandomForestRegressor()
scores = defaultdict(list)
 
#crossvalidate the scores on a number of different random splits of the data
for train_idx, test_idx in ShuffleSplit(len(X), 100, .3):
    X_train, X_test = X[train_idx], X[test_idx]
    Y_train, Y_test = Y[train_idx], Y[test_idx]
    r = rf.fit(X_train, Y_train)
    acc = r2_score(Y_test, rf.predict(X_test))
    for i in range(X.shape[1]):
        X_t = X_test.copy()
        np.random.shuffle(X_t[:, i])
        shuff_acc = r2_score(Y_test, rf.predict(X_t))
        scores[names[i]].append((acc-shuff_acc)/acc)
print ("Features sorted by their score:")
print (sorted([(round(np.mean(score), 4), feat) for
              feat, score in scores.items()], reverse=True))

In [None]:
result_train, result_test = train_test_split(result, test_size=0.25)

# Set up our RandomForestClassifier instance and fit to data
clf = RandomForestClassifier(n_estimators=30)
clf.fit(result_train[['event']], result_train["churn"])


# Make predictions
predictions = clf.predict(result_test[['event']])
probs = clf.predict_proba(result_test[['event']])
display(predictions)

score = clf.score(result_test[['event']], result_test["churn"])
print("Accuracy: ", score)

In [None]:
get_ipython().magic('matplotlib inline')
confusion_matrix = pd.DataFrame(
    confusion_matrix(result_test["churn"], predictions), 
    columns=["Predicted False", "Predicted True"], 
    index=["Actual False", "Actual True"]
)
display(confusion_matrix)

# Calculate the fpr and tpr for all thresholds of the classification
fpr, tpr, threshold = roc_curve(result_test["churn"], probs[:,1])
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn import svm
result_train, result_test = train_test_split(result, test_size=0.25)

# Set up our RandomForestClassifier instance and fit to data
clf = svm.SVC(probability=True)
clf.fit(result_train[['event']], result_train["churn"])


# Make predictions
predictions = clf.predict(result_test[['event']])
probs = clf.predict_proba(result_test[['event']])
display(predictions)

score = clf.score(result_test[['event']], result_test["churn"])
print("Accuracy: ", score)

# TO DO:
1. Confusionmatrix
2. Feature importance