In [1]:
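# pip install pandas scikit-learn openpyxl xlrd  # run this if a library is missing ("scikit-learn" is the PyPI name of sklearn; openpyxl is needed for .xlsx files, xlrd >= 2.0 only reads .xls)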

# Import Necessary libraries
import pandas as pd # for dataframe

from sklearn.model_selection import train_test_split# Easy way to split the data as traning and test

# Import Machine Learning Algorithms
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Load the Excel sheet into a DataFrame.
df = pd.read_excel("Kundenabwanderung.xlsx")

# Replace missing "Umsatz" values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df["Umsatz"] selection: the in-place form
# operates on an intermediate object and does not update `df` when pandas
# Copy-on-Write is active.
df["Umsatz"] = df["Umsatz"].fillna(df["Umsatz"].median())

# Encode the categorical columns as integer codes. factorize() numbers the
# distinct values in order of first appearance; [0] selects the code array.
df["Land"] = df["Land"].factorize()[0]
df["Geschlecht"] = df["Geschlecht"].factorize()[0]

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,RowNr,KundenID,Nachname,BonitaetsScore,Land,Geschlecht,Alter,Laufzeit,Umsatz,AnzahlProdukte,ZahlungPerKreditkarte,NewsletterAbo,Einkommen,Gekuendigt6M
0,1,15634602,Hargrave,619,0,0,42,2,119839.69,1,1,1,101300,1
1,2,15647311,Hill,608,1,0,41,1,83807.86,1,0,1,112500,0
2,3,15619304,Onio,502,0,0,42,8,159660.8,3,1,0,113900,1
3,4,15701354,Boni,699,0,0,39,1,119839.69,2,0,0,93800,0
4,5,15737888,Mitchell,850,1,0,43,2,125510.82,1,1,1,79100,0


In [3]:
# Share of customers per "Gekuendigt6M" class in percent
# (0 = Nicht Gekuendigt / not cancelled, 1 = Gekuendigt / cancelled).
# Per-class counts of non-null KundenID values are divided by the overall
# non-null KundenID count.
(
    df.groupby("Gekuendigt6M")["KundenID"].count()
    / df["KundenID"].count()
    * 100
)

Gekuendigt6M
0    79.63
1    20.37
Name: KundenID, dtype: float64

In [4]:
from sklearn import preprocessing

# Feature matrix: every column except the row/customer identifiers, the
# surname and the target column itself.
X = df.drop(["RowNr", "KundenID", "Nachname", "Gekuendigt6M"], axis=1)
y = df["Gekuendigt6M"].values  # target labels

# Split first (50 % test, fixed seed) so the held-out rows stay untouched by
# any preprocessing that is learned from data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=100)

# Fit the min-max scaler on the training rows only and reuse it for the test
# rows. Fitting it on the complete data set would leak the test set's min/max
# into training and make the reported accuracies optimistic.
# Consequence: test values can fall marginally outside [0, 1].
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep `X` available as a scaled array (same scaler) for any cell that reads it.
X = scaler.transform(X)


# Logistic Regression Classifier

In [5]:
# Logistic regression trained on the current X_train / y_train and scored on
# the held-out split.
# - `multi_class` is no longer passed: 'auto' is the default behaviour and the
#   parameter is deprecated in recent scikit-learn releases.
# - `random_state` is fixed because the "sag" solver shuffles the data, so the
#   fitted model could otherwise differ slightly from run to run.
LR = LogisticRegression(solver="sag", max_iter=100000, random_state=100)
LR.fit(X_train, y_train)
prediction = LR.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 80.9%


# Linear Support Vector Classifier

In [6]:
# Linear support vector classifier with default hyper-parameters.
# `random_state` is fixed so the score is reproducible: the solver may use a
# random number generator while fitting (it has no effect when the solver
# path is deterministic).
LSVC = LinearSVC(random_state=100)
LSVC.fit(X_train, y_train)
prediction = LSVC.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 80.32%


# Multinomial Naive Bayes Classifier

In [7]:
# Multinomial naive Bayes with default hyper-parameters. fit() returns the
# estimator itself, so construction and training are chained.
MNB = MultinomialNB().fit(X_train, y_train)
prediction = MNB.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print("accuracy: {}%".format(round(100 * accuracy_score(y_test, prediction), 2)))

accuracy: 79.54%


# Bernoulli Naive Bayes Classifier

In [8]:
# Bernoulli naive Bayes with default hyper-parameters, trained on the current
# training split and evaluated on the held-out split.
BNB = BernoulliNB()
BNB.fit(X=X_train, y=y_train)
prediction = BNB.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print(
    "accuracy: {}%".format(
        round(accuracy_score(y_true=y_test, y_pred=prediction) * 100, 2)
    )
)

accuracy: 79.62%


# PCA

In [9]:
# Reduce the features to two principal components.
# The projection is learned from the training rows only and then applied to
# both splits.
# NOTE: X_train / X_test are rebound here, so every cell executed after this
# one works on the 2-column projection instead of the full feature matrix.
from sklearn.decomposition import PCA

pca_model = PCA(n_components=2).fit(X_train)  # fit() returns the fitted model
X_train, X_test = pca_model.transform(X_train), pca_model.transform(X_test)

# KNN Classifier

In [10]:
# k-nearest-neighbours classifier: every test row is labelled by an
# unweighted vote of its K closest training rows.
K = 70  # number of neighbours taking part in the vote
KNN = KNeighborsClassifier(
    n_neighbors=K,
    weights='uniform',
    algorithm='auto',
    leaf_size=50,
)
KNN.fit(X_train, y_train)
prediction = KNN.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print("accuracy: {}%".format(round(100 * accuracy_score(y_test, prediction), 2)))

accuracy: 79.54%


# Stochastic Gradient Descent

In [11]:
# Linear classifier trained with stochastic gradient descent (squared hinge
# loss). The training data is shuffled between epochs, so `random_state` is
# fixed to make the reported accuracy reproducible.
SGD = SGDClassifier(loss='squared_hinge', alpha=0.0001, tol=0.1, random_state=100)
SGD.fit(X_train, y_train)
prediction = SGD.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 79.54%


# Gradient Boost Classifier

In [12]:
# It's worse with a scaler
# NOTE(review): when the notebook is run top to bottom, X_train / X_test at
# this point are the MinMax-scaled features already reduced to two PCA
# components, so this score is not an unscaled, full-feature baseline.
# `random_state` is fixed so repeated runs give the same model and score.
GB = GradientBoostingClassifier(random_state=100)
GB.fit(X_train, y_train)
prediction = GB.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 79.4%


# Random Forest Classifier

In [13]:
# It's worse with a scaler
# NOTE(review): when the notebook is run top to bottom, X_train / X_test at
# this point are the MinMax-scaled features already reduced to two PCA
# components, so this score is not an unscaled, full-feature baseline.
# A random forest draws bootstrap samples and random feature subsets;
# `random_state` is fixed so the reported accuracy does not change per run.
RF = RandomForestClassifier(random_state=100)
RF.fit(X_train, y_train)
prediction = RF.predict(X_test)

# Accuracy on the test split, as a percentage rounded to two decimals.
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 75.02%
