In [31]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import accuracy_score , f1_score , precision_score

# load Data

In [32]:
# Read the raw customer table from disk.
data = pd.read_csv('customer_churn.csv')

# Print column names, non-null counts and dtypes, then preview the first rows.
data.info()
data.head(n=5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Names            900 non-null    object 
 1   Age              900 non-null    float64
 2   Total_Purchase   900 non-null    float64
 3   Account_Manager  900 non-null    int64  
 4   Years            900 non-null    float64
 5   Num_Sites        900 non-null    float64
 6   Onboard_date     900 non-null    object 
 7   Location         900 non-null    object 
 8   Company          900 non-null    object 
 9   Churn            900 non-null    int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 70.4+ KB


Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1


# Display descriptive statistics of the data

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `data`.
data.describe()

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
count,900.0,900.0,900.0,900.0,900.0,900.0
mean,41.816667,10062.824033,0.481111,5.273156,8.587778,0.166667
std,6.12756,2408.644532,0.499921,1.274449,1.764836,0.372885
min,22.0,100.0,0.0,1.0,3.0,0.0
25%,38.0,8497.1225,0.0,4.45,7.0,0.0
50%,42.0,10045.87,0.0,5.215,8.0,0.0
75%,46.0,11760.105,1.0,6.11,10.0,0.0
max,65.0,18026.01,1.0,9.15,14.0,1.0


# selected columns

In [34]:
# Numeric features plus the `Churn` target; the text/date columns are left out.
selected_columns = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Churn']
cdf = data[selected_columns]

# Preview the first ten rows of the reduced frame.
cdf.head(n=10)

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
0,42.0,11066.8,0,7.22,8.0,1
1,41.0,11916.22,0,6.5,11.0,1
2,38.0,12884.75,0,6.67,12.0,1
3,42.0,8010.76,0,6.71,10.0,1
4,37.0,9191.58,0,5.56,9.0,1
5,48.0,10356.02,0,5.12,8.0,1
6,44.0,11331.58,1,5.23,11.0,1
7,32.0,9885.12,1,6.92,9.0,1
8,43.0,14062.6,1,5.46,11.0,1
9,40.0,8066.94,1,7.11,11.0,1


# correlation between selected features

In [35]:
# Pairwise correlation matrix of the selected columns, including the target `Churn`.
cdf.corr()

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
Age,1.0,-0.037208,-0.014749,0.005625,-0.00607,0.085926
Total_Purchase,-0.037208,1.0,0.015856,-0.005623,-0.00339,0.024031
Account_Manager,-0.014749,0.015856,1.0,0.02293,0.033401,0.070611
Years,0.005625,-0.005623,0.02293,1.0,0.051642,0.214329
Num_Sites,-0.00607,-0.00339,0.033401,0.051642,1.0,0.525398
Churn,0.085926,0.024031,0.070611,0.214329,0.525398,1.0


# Checking all null values in Data

In [36]:
# `isnull().sum()` counts missing values per column; the second `sum()` totals them.
total_missing = cdf.isnull().sum().sum()
print('All null values in Data : ', total_missing)

All null values in Data :  0


# Handling duplicate values

In [37]:
# DataFrame.drop_duplicates() is not in-place: it returns a new frame.
# The result must be assigned back, otherwise the duplicated rows silently
# stay in `cdf` and flow into the train/test split below.
cdf = cdf.drop_duplicates()

# Display the de-duplicated frame (truncated automatically by the notebook).
cdf

Unnamed: 0,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
0,42.0,11066.80,0,7.22,8.0,1
1,41.0,11916.22,0,6.50,11.0,1
2,38.0,12884.75,0,6.67,12.0,1
3,42.0,8010.76,0,6.71,10.0,1
4,37.0,9191.58,0,5.56,9.0,1
...,...,...,...,...,...,...
895,42.0,12800.82,1,3.62,8.0,0
896,52.0,9893.92,0,6.91,7.0,0
897,45.0,12056.18,0,5.46,4.0,0
898,51.0,6517.93,1,5.47,10.0,0


# Split data

In [38]:
# Target vector: the churn label.
y = cdf['Churn']
# Feature matrix: every selected column except the label.
x = cdf.drop(columns=['Churn'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although `Churn` is imbalanced
# (mean 0.167 in describe()) — consider `stratify=y`.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Cross Validation

In [39]:
# Unfitted estimator; this same `model` object is fitted in the next cell.
model = LogisticRegression(max_iter=200)

# 5-fold cross-validation on the training split only. No `scoring` argument
# is passed, so the estimator's default scorer is used.
cv_Scores = cross_val_score(estimator=model, X=train_x, y=train_y, cv=5)

# Per-fold scores followed by their average.
print('cross Validation Scores ', cv_Scores)
print('Mean of cross Validation Scores ', cv_Scores.mean())

cross Validation Scores  [0.85714286 0.92063492 0.88095238 0.80952381 0.88888889]
Mean of cross Validation Scores  0.8714285714285716


# Train and Test model 

In [40]:
# Fit on the training split (fit() returns the estimator itself) and
# predict churn labels for the held-out test rows.
y_predict = model.fit(train_x, train_y).predict(test_x)
y_predict

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0], dtype=int64)

# Get coefficients and intercept

In [41]:
# Learned parameters of the fitted logistic regression: one weight per feature
# column and a single bias term.
coefficients, intercept = model.coef_, model.intercept_

print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [[-1.09497681e-01 -1.92838158e-04  4.30480873e-02 -5.58107239e-02
   5.77143053e-01]]
Intercept: [-0.12229246]


# compute Accuracy

In [42]:
# Fraction of test rows whose predicted label equals the true label.
Accuracy = accuracy_score(y_true=test_y, y_pred=y_predict)
print(f'Accuracy = {Accuracy * 100}%')


Accuracy = 83.7037037037037%


# compute F1_score

In [43]:
# F1 score on the test split, using f1_score's default settings.
F1_Score = f1_score(y_true=test_y, y_pred=y_predict)
print(f'F1_Score = {F1_Score * 100}%')

F1_Score = 24.137931034482765%


# compute Precision

In [44]:
# Precision on the test split, using precision_score's default settings.
Precision = precision_score(y_true=test_y, y_pred=y_predict)
print(f'Precision = {Precision * 100}%')

Precision = 46.666666666666664%


# predict  Churn on new Data 

In [45]:
# Load the customers to score. drop_duplicates() returns a new frame, so its
# result must be kept — calling it without assignment discards the
# de-duplicated rows. The index is reset so that rows and predictions line
# up positionally as 0..n-1.
new_data = (
    pd.read_csv('new_customers.csv', index_col=False)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Same feature columns, in the same order, that the model was trained on.
new_cdf = new_data[['Age','Total_Purchase','Account_Manager','Years','Num_Sites']]

# Predicted churn label for each new customer.
predictions = model.predict(new_cdf)
predictions


array([0, 1, 1, 1, 0, 1], dtype=int64)

# Write predictions to an Excel file

In [46]:
# Attach the predicted label to each new customer row.
new_data['Churn'] = predictions

# Persist the scored customers without the index column, then load the file
# back to confirm what was written.
output_path = 'new_customers.xlsx'
new_data.to_excel(output_path, index=False)
df = pd.read_excel(output_path)

df.head(n=7)


Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,Andrew Mccall,37,9935.53,1,7.71,8,2011-08-29 18:37:54,"38612 Johnny Stravenue Nataliebury, WI 15717-8316",King Ltd,0
1,Michele Wright,23,7526.94,1,9.28,15,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332, Youngport, ME...",Cannon-Benson,1
2,Jeremy Chang,65,100.0,1,1.0,15,2006-12-11 07:48:13,"085 Austin Views Lake Julialand, WY 63726-4298",Barron-Robertson,1
3,Megan Ferguson,32,6487.5,0,9.4,14,2016-10-28 05:32:13,"922 Wright Branch North Cynthialand, NC 64721",Sexton-Golden,1
4,Taylor Young,32,13147.71,1,10.0,8,2012-03-20 00:36:46,Unit 0789 Box 0734 DPO AP 39702,Wood LLC,0
5,Jessica Drake,22,8445.26,1,3.46,14,2011-02-04 19:29:27,1148 Tina Stravenue Apt. 978 South Carlos TX 2...,Parks-Robbins,1
