ANALYZE, VISUALIZE AND PREPARE DATA

In [1]:
#Import necessary libraries

import numpy as np 
import pandas as pd 
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
#read dataset
df = pd.read_csv("customer.csv", encoding='unicode_escape')

In [None]:
#Quick look dataset
df.head(15)

In [None]:
# knowing its shape
df.shape

In [None]:
# getting its basic information
df.info()

In [None]:
df.describe().T

In [None]:
df.columns

In [8]:
#Rename wrong column name
df.rename(columns ={'Country;;;;;;':'Country'},inplace=True)

In [None]:
# Fix wrong values: strip the stray ';' characters from the Country values
# (the column header carried the same trailing ';' noise).
# regex=False states explicitly that ';' is a literal, so the result does not
# depend on the pandas version's default for `regex`.
df['Country'] = df['Country'].str.replace(';', '', regex=False)
df.head(15)

In [None]:
# Fix wrong values: remove literal periods from the product descriptions.
# '.' is a regex metacharacter (it matches any character), so the pattern is
# passed with regex=False to guarantee that only real '.' characters are
# removed, whatever the installed pandas version uses as its default.
df['Description'] = df['Description'].str.replace('.', '', regex=False)
df.head(15)

In [None]:
# Look at the 10 products that appear most often in the sales rows.
# head() defaults to 5 rows, so the count is passed explicitly.
df["StockCode"].value_counts().head(10)

In [None]:
# visualizing top 10 products that sold using countplot
top10_product = df['StockCode'].value_counts().head(10).index.tolist()

plt.figure(figsize= (15, 8))
sns.countplot(x= 'StockCode', data= df, order= top10_product,  palette= "Set2")
plt.title('Top 10 Products')
plt.tight_layout()
plt.show()

In [None]:
# checking for null values
df.isnull().sum()

In [14]:
#Delete empties
df.dropna(inplace = True)

In [None]:
# checking for null values
df.isnull().sum()

In [None]:
# checking for redundancy
df.duplicated().sum()

In [17]:
# dropping redundancy
df.drop_duplicates(keep= 'first', inplace= True)

In [None]:
# checking for redundancy
df.duplicated().sum()

In [None]:
# getting its basic information
df.info()

In [None]:
# Parse the invoice timestamps so later cells can compare and subtract dates.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

# Invoices whose number contains the letter "C" are cancellations; they are
# what produces the very low Quantity values, so they are dropped here.
#  - astype(str) keeps the .str accessor usable even if InvoiceNo was parsed
#    as a numeric column.
#  - .copy() makes the filtered frame independent, so the column assignments
#    in the following cells do not trigger SettingWithCopyWarning.
df = df[~df["InvoiceNo"].astype(str).str.contains("C", na=False)].copy()

df.describe().T

In [None]:
df.dtypes

In [None]:
# Change data type
#Categorical value
df['InvoiceNo'] = df['InvoiceNo'].astype(object)

df.dtypes

In [None]:
# Change data type
df['Quantity'] = df['Quantity'].astype(int)

df.dtypes

In [None]:
# Change data type
#categorical value
df['CustomerID'] = df['CustomerID'].astype(object)

df.dtypes

In [None]:
# visualizing top 10 frequent customer using countplot
top10_customer = df['CustomerID'].value_counts().head(10).index

plt.figure(figsize= (8,5))
sns.countplot(x= 'CustomerID', data= df, order= top10_customer,  palette= 'crest')
plt.title('Top 10 Customer')
plt.tight_layout()
plt.show()

In [None]:
#Check value counts
df.count()

In [None]:
#Check mathematical data
df.describe().T

In [28]:
#Create new column for customer analysis
df["TotalPrice"]=df["Quantity"]*df["UnitPrice"]

In [None]:
#Total price by country
df.groupby("Country").agg({"TotalPrice":"sum"}).sort_values("TotalPrice", ascending=False ).head()

In [None]:
# visualizing total sales per day using lineplot
# InvoiceDate carries a time-of-day component, so grouping on the raw column
# yields one point per distinct timestamp. Flooring to the calendar day first
# gives the daily totals the title promises. The result is in chronological
# order (no value sort is needed for a line plot).
sales_perday = (
    df.groupby(df['InvoiceDate'].dt.floor('D'))['TotalPrice']
      .sum()
      .reset_index()
)

plt.figure(figsize= (15,5))
sns.lineplot(x= 'InvoiceDate', y= 'TotalPrice', data= sales_perday, color= '#0a437a')
plt.title('Total Sales per day')
plt.show()

In [None]:
#Top ten country's customer
df.Country.value_counts()[:10].plot(kind='bar')

In [None]:
#Top ten country by total price
df.groupby("Country").agg({"TotalPrice":"sum"}).sort_values("TotalPrice", ascending=False )[:10].plot(kind='bar')

In [None]:
df.head()

In [None]:
#Check mathematical data
df.describe().T

In [None]:
# Customer purchased products at last time

df["InvoiceDate"].max()

In [None]:
# getting the latest invoice date in the dataset
latest_invoice_date = df['InvoiceDate'].max()
latest_invoice_date

In [None]:
# Cut-off timestamp: three months before the latest invoice in the dataset.
# It is derived from latest_invoice_date instead of being typed in as a
# literal, so it always matches the data that was actually loaded.
mon3_ret_date = latest_invoice_date - pd.DateOffset(months=3)
mon3_ret_date

In [38]:
# taking the first part of data that doesn't have last 3 months of transaction

# The rows are selected with a boolean mask on mon3_ret_date, the variable that
# defines the split, rather than a separately typed timestamp. A mask also does
# not depend on the rows being sorted by date.
# InvoiceDate is moved to the index so that the reset_index() in the following
# cell restores it as the first column, as before.
df_part1 = df[df['InvoiceDate'] <= mon3_ret_date].set_index('InvoiceDate')

In [39]:
# reseting the index
df_part1.reset_index(inplace= True)

In [None]:
df_part1.info()

In [None]:
df_part1.head()

RFM ANALYSIS

In [None]:
# calculating the recency of each customer

# Recency = whole days between a customer's last purchase and the end of the
# observation window (the last invoice in df_part1).
# Measuring from the end of df_part1, not from the latest invoice of the full
# dataset, keeps the feature free of information from the later period and
# lets values fall on both sides of the 90-day threshold used further down.
snapshot_date = df_part1['InvoiceDate'].max()
last_purchase = df_part1.groupby('CustomerID')['InvoiceDate'].max()

recency = (snapshot_date - last_purchase).dt.days.rename('Recency').reset_index()
recency

In [None]:
# Frequency of each customer = number of transaction rows recorded for them

frequency = (
    df_part1.groupby('CustomerID')
            .size()               # rows per customer
            .rename('Frequency')  # name the resulting column
            .reset_index()        # CustomerID back to a regular column
)

frequency

In [None]:
# Monetary value of each customer = sum of TotalPrice over all their rows

monetary = (
    df_part1.groupby('CustomerID', as_index=False)['TotalPrice']
            .sum()
            .rename(columns={'TotalPrice': 'Monetary'})
)
monetary

In [None]:
# creating a new dataframe RFM with recency, frequency and monetary of each customer
# The three tables are joined on CustomerID, so each value is matched to its
# customer by key instead of by row position. validate='one_to_one' raises if
# any table unexpectedly holds a customer more than once.
rfm = (
    recency
    .merge(frequency, on='CustomerID', validate='one_to_one')
    .merge(monetary, on='CustomerID', validate='one_to_one')
)
rfm.head()

In [None]:
# describing feature variables of RFM dataframe
rfm.describe()

In [None]:
# visualizing the distribution of feature variables in RFM dataframe
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same histogram-plus-density-curve view.
fig, axis = plt.subplots(3,1, figsize= (6,8))

for i,feature in enumerate(['Recency', 'Frequency', 'Monetary']):
    sns.histplot(x= rfm[feature], kde= True, stat= 'density', ax= axis[i], color= '#7e4071')
    axis[i].set_xlabel(feature)  # replaces distplot's axlabel argument

plt.tight_layout()
plt.show()

In [None]:
'''
# Recency values should be higher than 1, so today_date can be choosen maximum value +2

today_date = pd.datetime(2011, 12, 11)

rfm = df.groupby("CustomerID").agg({"InvoiceDate": lambda date: (today_date - date.max()).days,
                                     "InvoiceNo": lambda InvoiceNo: InvoiceNo.nunique(),
                                     "TotalPrice":lambda TotalPrice: TotalPrice.sum()})

rfm.columns = ["Recency", "Frequency", "Monetary"]

rfm = rfm[rfm["Monetary"] > 0]

rfm.describe().T
'''

In [None]:
import seaborn as sns

sns.set(rc = {"figure.figsize" : (18,5)})

# The series is passed as x= so the box is drawn horizontally and the call does
# not rely on how a positional argument is interpreted, which differs between
# seaborn releases.
sns.boxplot(x=rfm["Monetary"], palette="summer")

In [None]:
sns.set(rc = {"figure.figsize" : (18,5)})

# The series is passed as x= so the box is drawn horizontally and the call does
# not rely on how a positional argument is interpreted, which differs between
# seaborn releases.
sns.boxplot(x=rfm["Frequency"], palette="summer")

In [None]:
sns.set(rc = {"figure.figsize" : (18,5)})

# The series is passed as x= so the box is drawn horizontally and the call does
# not rely on how a positional argument is interpreted, which differs between
# seaborn releases.
sns.boxplot(x=rfm["Recency"], palette="summer")

In [None]:
# Frequency, recency and monetary are each scored from 1 to 5 by quintile with qcut.
# Equal-width bins (pd.cut) would put nearly every customer in the lowest bin
# of a skewed column, because a few extreme values stretch the range.
# Ranking with method="first" before binning makes every value unique, so the
# quantile edges are always distinct even when many customers share a value.
# Recency labels run 5..1: a smaller recency (more recent purchase) scores higher.
rfm["Recency_score"] = pd.qcut(rfm["Recency"].rank(method="first"), 5, labels=[5,4,3,2,1])

rfm["Monetary_score"] = pd.qcut(rfm["Monetary"].rank(method="first"), 5, labels=[1,2,3,4,5])

rfm["Frequency_score"] = pd.qcut(rfm["Frequency"].rank(method="first"), 5, labels=[1,2,3,4,5])

# Overall score = sum of the three per-dimension scores (range 3..15).
rfm["RFM_SCORE"] = (rfm["Recency_score"].astype(int) + rfm["Frequency_score"].astype(int) + rfm["Monetary_score"].astype(int))

rfm.head(10)

In [None]:
rfm.dtypes

In [None]:
import seaborn as sns

sns.pairplot(rfm,hue="Frequency_score")

In [None]:
import seaborn as sns

sns.pairplot(rfm,hue="Recency_score")

In [56]:
# importing required libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
rfm.head()

In [None]:
rfm.dtypes

In [59]:
# assigning the required independent feature variables of RFM dataframe to X_rfm variable
X_rfm = rfm[['Recency_score', 'Frequency_score', 'Monetary_score', 'RFM_SCORE']]

In [60]:
# standardizing the data with StandardScaler
std_scaler = StandardScaler()
X_rfm = std_scaler.fit_transform(X_rfm)

In [61]:
# building k-means clustering model and by elbow curve method finding the optimal cluster value

list_wcss = [] # Within Cluster Sum of Squares (inertia) for each candidate k

k = range(1,11)

for i in k:

    # n_init is pinned: its default differs between scikit-learn releases, so
    # fixing it keeps the curve reproducible and avoids the FutureWarning.
    kmeans = KMeans(n_clusters= i, n_init= 10, random_state= 42)
    kmeans.fit(X_rfm) # fitting data into model

    list_wcss.append(kmeans.inertia_) # appending WCSS value to list_wcss



In [None]:
# ploting wcss against k to find optimal k value
plt.plot(k, list_wcss, 'rD--')
plt.xlabel('Number of clusters (K)')
plt.ylabel('WCSS')
plt.title('Elbow Curve')
plt.show()

In [None]:
# Assign every customer to a segment from their RFM_SCORE.
# Intervals are right-closed: (0, 6] -> Basic / 3, (6, 10] -> Standard / 2,
# (10, 18] -> Prime / 1.
score_bins = [0, 6, 10, 18]
segment_numbers = [3, 2, 1]
segment_names = ['Basic Customer', 'Standard Customer', 'Prime Customer']

rfm['Seg_Num'] = pd.cut(rfm['RFM_SCORE'], bins=score_bins, labels=segment_numbers)
rfm['Segment_Label'] = pd.cut(rfm['RFM_SCORE'], bins=score_bins, labels=segment_names)

rfm.head()


In [None]:
# understanding the different segment of customers with median value
rfm.groupby('Segment_Label')[['Recency', 'Frequency', 'Monetary']].median().round().reset_index()



In [None]:
# visualizing the RFM Scores of different segment of customers using strip plot
sns.stripplot(x= 'Seg_Num', y= 'RFM_SCORE', data= rfm, hue= 'Segment_Label', palette= 'mako', jitter=False)
plt.title('RFM Scores of each Segment')
plt.show()

In [None]:
# visualizing the distribution of different segment of customers with their RFM score using violin plot
sns.violinplot(x= 'Seg_Num', y= 'RFM_SCORE', data= rfm, hue= 'Segment_Label', palette= 'mako')
plt.title('Distribution of RFM Scores of each Segment')
plt.show()

In [None]:
# visualizing the different segment of customers and thier recency, frequency and monetary values using strip plot
fig, axis = plt.subplots(3,1, figsize= (5,10))

for i,feature in enumerate(['Recency', 'Frequency', 'Monetary']):
    sns.stripplot(x= 'Seg_Num', y= feature, data= rfm, hue= 'Segment_Label', palette= 'mako', ax= axis[i])

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# knowing the percentage of each segment of customers using pie chart
segment_count = rfm['Segment_Label'].value_counts()

plt.pie(segment_count.values, labels= segment_count.index, autopct='%1.1f%%')
plt.title('Percentage of Customer in each Segment')
plt.show()

CHURN ANALYSIS

In [69]:
# taking the second part of data that have only last 3 months of transaction

# Rows strictly after the cut-off held in mon3_ret_date. The strict '>' keeps
# a row stamped exactly at the cut-off out of this part. A boolean mask does
# not depend on the rows being sorted by date.
# set_index + reset_index keeps InvoiceDate as the first column and gives the
# frame a fresh 0..n-1 index.
df_part2 = (
    df[df['InvoiceDate'] > mon3_ret_date]
    .set_index('InvoiceDate')
    .reset_index()
)

In [None]:
# viewing the data of df_part2
df_part2.head()

In [None]:
df_part2.info()

In [None]:
# knowing the number of customers in df_part2
df_part2['CustomerID'].nunique()

In [None]:
# knowing the number of customers in df_part1
df_part1['CustomerID'].nunique()

In [74]:
# getting the number of customers in part1 and part2
part1_customer = df_part1['CustomerID'].sort_values().unique()
part2_customer = df_part2['CustomerID'].sort_values().unique()

In [None]:
# finding how many new customers in part2

# A set gives constant-time membership tests; testing against the numpy array
# would rescan it for every customer.
part1_ids = set(part1_customer)

# customers seen in part2 that never appear in part1, in part2_customer order
New_customers = [cid for cid in part2_customer if cid not in part1_ids]

print(f'Total Number of New Customers: {len(New_customers)}')

In [76]:
# finding how many old customers made transaction in last three months

# A set gives constant-time membership tests; testing against the numpy array
# would rescan it for every customer.
part2_ids = set(part2_customer)

# one 'Yes'/'No' flag per part1 customer, in part1_customer order
R_next_3months = ['Yes' if cid in part2_ids else 'No' for cid in part1_customer]


# Building the R_next_3months list
#R_next_3months = ['Yes' if customer_id in part2_customer else 'No' for customer_id in rfm.index]

In [None]:
len(R_next_3months)

In [None]:
len(rfm)

In [79]:
# Each row is flagged by looking up its own CustomerID among the part-2
# customers, so the flag cannot be shifted onto the wrong customer if the row
# order of rfm ever differs from the order of a separately built list.
rfm['R_Next_3Months'] = np.where(rfm['CustomerID'].isin(part2_customer), 'Yes', 'No') # adding the new feature variable

In [None]:
# viewing RFM dataframe
rfm.head(15)

In [None]:
rfm['R_Next_3Months'].isnull().value_counts()

In [None]:
rfm['R_Next_3Months'].value_counts()

In [83]:
# finding whether the customer is churned or not based on conditions
#
#   Recency <= 90, returned      -> 'No'
#   Recency <= 90, not returned  -> 'High Risk' if Frequency <= 15 else 'Low Risk'
#   Recency  > 90, returned      -> 'No' if Frequency > 15 else 'Low Risk'
#   Recency  > 90, not returned  -> 'Yes'
#
# The rules are evaluated column-wise. That avoids rfm[col][i], which treats a
# loop position as an index label and is only correct for a 0..n-1 index.

recent = rfm['Recency'] <= 90
stale = rfm['Recency'] > 90
returned = rfm['R_Next_3Months'] == 'Yes'
not_returned = rfm['R_Next_3Months'] == 'No'

# Every row must fall into exactly one of the four cases above; fail loudly
# rather than produce a label column that is silently misaligned.
unclassified = ~((recent | stale) & (returned | not_returned))
if unclassified.any():
    raise ValueError(
        f'{int(unclassified.sum())} customers have a missing Recency or an '
        f'unexpected R_Next_3Months value and cannot be classified'
    )

# np.select takes the first matching condition, so each general case is listed
# after its more specific Frequency-dependent variant.
Churn = np.select(
    [
        recent & returned,
        recent & not_returned & (rfm['Frequency'] <= 15),
        recent & not_returned,
        stale & returned & (rfm['Frequency'] > 15),
        stale & returned,
    ],
    ['No', 'High Risk', 'Low Risk', 'No', 'Low Risk'],
    default='Yes',  # remaining rows: Recency > 90 and not returned
)

rfm['Churn'] = Churn # adding the new feature variable churn

In [None]:
rfm.head()

In [None]:
rfm['Churn'].value_counts()

In [None]:
# knowing the percentage of each class in churn using pie chart
churn_count = rfm['Churn'].value_counts()

plt.pie(churn_count.values, labels= churn_count.index, autopct='%1.1f%%')
plt.title('Churn')
plt.show()

In [None]:
# visualizing different segment of customers and thier churn class
sns.countplot(x= 'Churn', data= rfm, hue= 'Segment_Label', palette= 'rocket')
plt.title('Count of Customers in each Segment with Churn Class')
plt.show()

In [None]:
# visualizing and knowing the percentage of churn class for different segment of customers 
segment_list = rfm['Segment_Label'].unique().sort_values(ascending= False)

for i in segment_list:
    segment = rfm[['Segment_Label','Churn']][rfm.Segment_Label == i]
    segment_churn = segment.value_counts().to_frame().reset_index().rename(columns= {0:'count'})
    
    plt.pie(x= segment_churn['count'], labels= segment_churn['Churn'], autopct= '%.1f%%')
    plt.title(i)
    plt.tight_layout()
    plt.show()

In [None]:
rfm.head()

In [None]:
rfm.info()

In [91]:
# converting the datatype of categorical feature from int to object
rfm[['CustomerID', 'Seg_Num', 'Segment_Label']] = rfm[['CustomerID', 'Seg_Num', 'Segment_Label']].astype(object)

In [92]:
# converting the three per-dimension score columns to float
rfm[['Recency_score', 'Monetary_score', 'Frequency_score']] = rfm[['Recency_score', 'Monetary_score', 'Frequency_score']].astype(float)

In [None]:
rfm.dtypes

In [None]:
# creating correlation matrix for the numeric feature variables
corr = rfm[['Recency', 'Frequency', 'Monetary', 'Recency_score', 'Monetary_score', 'Frequency_score', 'RFM_SCORE']].corr()
corr

In [None]:
# visualizing the correlation between numeric feature variables using heat map
plt.figure(figsize= (8,6))
sns.heatmap(data= corr, fmt= '.2f', linewidths= 0.2, linecolor= 'white', cmap= 'Blues', annot= True)
plt.tight_layout()
plt.show()

In [96]:
# getting the variance in each features
#rfm.var()

In [None]:
# dropping unwanted and multicollinearity feature variables
df_segment = rfm.copy() # copy of RFM dataframe
df_segment.drop(['CustomerID', 'Recency_score', 'Frequency_score', 'Monetary_score', 'Seg_Num'], axis= 1, inplace= True) # dropping
df_segment.head()

In [None]:
# visualizing and getting know whether independent feature variables having outlier using box plot
num_features = df_segment.select_dtypes(exclude= 'object').columns

fig, axis = plt.subplots(1,4,figsize=(10,3))
axis = axis.flatten()

for i,feature in enumerate(num_features):
    sns.boxplot(y= feature, data= df_segment, ax= axis[i], color= '#4c9085')

plt.tight_layout()
plt.show()

In [99]:
# getting the feature variables having outliers
outliers_features = ['Frequency', 'Monetary']

In [100]:
# removing outliers using zscore 
from scipy import stats
zscore_frequency = np.abs(stats.zscore(df_segment['Frequency'])) # calculating Z-score for frequency
zscore_monetary = np.abs(stats.zscore(df_segment['Monetary'])) # calculating Z-score for monetary

threshold = 3 # setting threshold value

outliers_frequency = list(np.where(zscore_frequency>threshold)[0]) # getting outliers index in frequency
outliers_monetary = list(np.where(zscore_monetary>threshold)[0]) # getting outliers index in monetary

outliers_indices = list(set(outliers_frequency + outliers_monetary)) # creating a set for getting unique index of outliers
outliers_indices.sort() # sorting the list

df_segment = df_segment.drop(df_segment.index[outliers_indices]) # dropping outlier records

In [None]:
print(df_segment)

In [102]:
# encoding categorical variable using Label Encoder
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df_segment['R_Next_3Months'] = labelencoder.fit_transform(df_segment['R_Next_3Months'])

In [None]:
df_segment.head()

In [104]:
df_segment['Segment_Label'] = labelencoder.fit_transform(df_segment['Segment_Label'])

In [None]:
df_segment.head()

In [106]:
df_segment['Churn'] = labelencoder.fit_transform(df_segment['Churn'])

In [None]:
df_segment.head()

In [108]:
# assigning variables for independent and dependent feature variables

X = df_segment.drop(['Churn'], axis= 1) # independent feature variables
# The target is selected as a 1-D Series: scikit-learn estimators expect a 1-D
# y and emit a DataConversionWarning for a single-column DataFrame.
y = df_segment['Churn'] # dependent feature variable

In [None]:
# standardization of independent variables
'''
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X = std.fit_transform(X)
'''

In [110]:
# Train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [None]:
# Replacing the 0 values with mean
'''
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values = 0, strategy='mean')

X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)
'''


In [None]:
# Lets see the shape of our train and test datasets

print('Shape training set: X:{}, y:{}'.format(X_train.shape, y_train.shape))
print('Shape test set: X:{}, y:{}'.format(X_test.shape, y_test.shape))

In [113]:
# Modeling Libraries
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split

In [114]:
def evaluate_model(models):
    """
    Cross-validate each model on the training split and chart the mean accuracy.

    Parameters
    ----------
    models : list
        Estimators to compare. Any number of models may be passed; each row of
        the result is labelled with the class name of the corresponding model.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``CrossValMeans`` (mean accuracy
        over the folds), ``CrossValerrors`` (standard deviation over the folds)
        and ``Models`` (estimator class name).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train`` and draws a horizontal
    bar chart of the mean accuracies as a side effect.
    """

    # Cross validate model with Kfold stratified cross val
    kfold = StratifiedKFold(n_splits = 10)

    # cross_val_score expects a 1-D target; flatten in case y_train is a
    # single-column DataFrame.
    target = np.ravel(y_train)

    scores = [
        cross_val_score(estimator=model, X=X_train, y=target, scoring="accuracy", cv=kfold, n_jobs=1)
        for model in models
    ]

    result_df = pd.DataFrame({
        "CrossValMeans": [fold_scores.mean() for fold_scores in scores],
        "CrossValerrors": [fold_scores.std() for fold_scores in scores],
        # Labels come from the models themselves, so the chart stays correct
        # when the list is reordered, shortened or extended.
        "Models": [type(model).__name__ for model in models],
    })

    # Generate chart
    bar = sns.barplot(x="CrossValMeans", y="Models", data=result_df, orient="h")
    bar.set_xlabel("Mean Accuracy")
    bar.set_title("Cross validation scores")
    return result_df


In [None]:
# Modeling step Test differents algorithms 
random_state = 42
models = [
    LogisticRegression(random_state = random_state, solver='liblinear'),
    DecisionTreeClassifier(random_state = random_state),
    AdaBoostClassifier(DecisionTreeClassifier(random_state = random_state), random_state = random_state, learning_rate = 0.2),
    SVC(random_state = random_state),
    RandomForestClassifier(random_state = random_state),
    GradientBoostingClassifier(random_state = random_state),
    KNeighborsClassifier(),
]
evaluate_model(models)

In [116]:
# Import libraries
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
def analyze_grid_result(grid_result):
    """
    Print a summary of a fitted grid search.

    Prints the best parameter combination and its score, then one line per
    candidate with its mean test score and twice its standard deviation, and
    finally a classification report for predictions on the notebook-level
    ``X_test`` compared against ``y_test``.

    Parameters
    ----------
    grid_result : fitted search object
        Must expose ``best_params_``, ``best_score_``, ``cv_results_`` and
        ``predict``.
    """
    # Winning configuration and its cross-validated score
    print("Tuned hyperparameters: (best parameters) ", grid_result.best_params_)
    print("Accuracy :", grid_result.best_score_)

    # One line per candidate in the grid
    cv_table = grid_result.cv_results_
    avg_scores = cv_table["mean_test_score"]
    score_stds = cv_table["std_test_score"]
    candidates = cv_table["params"]
    for avg, spread, candidate in zip(avg_scores, score_stds, candidates):
        print("%0.3f (+/-%0.03f) for %r" % (avg, spread * 2, candidate))
    print()

    # Held-out performance of the search's predictor
    print("Detailed classification report:")
    predictions = grid_result.predict(X_test)
    print(classification_report(y_test, predictions))
    print()

In [None]:
# Define models and parameters for LogisticRegression
model = LogisticRegression(solver='liblinear')
solvers = ['newton-cg', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# Define grid search
grid = dict(solver = solvers, penalty = penalty, C = c_values)
cv = StratifiedKFold(n_splits = 50, random_state = 1, shuffle = True)
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = cv, scoring = 'accuracy', error_score = 0)
logi_result = grid_search.fit(X_train, y_train)
# Logistic Regression Hyperparameter Result
analyze_grid_result(logi_result)

In [None]:
# Initializing the model with the best parameters

# The parameters are taken from the grid search result (logi_result) instead
# of being copied by hand, so this cell cannot drift out of sync with what the
# search actually selected.
model = LogisticRegression(**logi_result.best_params_)
model = model.fit(X_train, y_train)

In [None]:
# Model Prediction and Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, the confusion matrix is transposed and the precision and recall
# columns of the classification report are exchanged.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))