### Telco customer churn project

In [239]:
import pandas as pd

data = pd.read_csv('./WA_Fn-UseC_-Telco-Customer-Churn.csv')

data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [240]:
# checking data types

data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [241]:
# getting data ready

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

data['Churn'].unique()
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

data.isnull().sum()
len(data)
# Total charges is null in 11 cases - removing those cases
data = data.dropna()


In [242]:
X = data.drop(columns=['customerID', 'Churn'])
y = data['Churn']

In [243]:
# encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['gender', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
one_hot = OneHotEncoder(sparse_output=False)
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')
transformed_X = transformer.fit_transform(X)

# Saving the names of colums
encoded_feature_names = transformer.named_transformers_['one_hot'].get_feature_names_out(categorical_features)
all_feature_names = list(encoded_feature_names) + [col for col in X.columns if col not in categorical_features]

X_transformed_df = pd.DataFrame(transformed_X, columns=all_feature_names)
X_transformed_df.head()

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,tenure_1,tenure_2,tenure_3,tenure_4,...,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,MonthlyCharges,TotalCharges
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,29.85,29.85
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,56.95,1889.5
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,53.85,108.15
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,42.3,1840.75
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,70.7,151.65


In [255]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_transformed_df, y, test_size=.2)

In [256]:
X_train.head()

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,tenure_1,tenure_2,tenure_3,tenure_4,...,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,MonthlyCharges,TotalCharges
872,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,99.7,4634.35
4907,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,39.2,849.9
1096,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,24.8,324.15
6779,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,25.6,1790.35
1794,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,98.7,973.25


In [257]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

model_svm = svm.SVC()
model_svm.fit(X_train, y_train)
model_svm.score(X_test, y_test)

0.7299218194740583

In [258]:
from sklearn.metrics import classification_report

y_svm_preds = model_svm.predict(X_test)

classification_report_svm = classification_report(y_test, y_svm_preds)

print(classification_report_svm)

# low precision with svm

              precision    recall  f1-score   support

           0       0.73      1.00      0.84      1027
           1       0.00      0.00      0.00       380

    accuracy                           0.73      1407
   macro avg       0.36      0.50      0.42      1407
weighted avg       0.53      0.73      0.62      1407



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [259]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
rfc_model.score(X_test, y_test)

0.7967306325515281

In [260]:
from sklearn.metrics import classification_report

y_rfc_preds = rfc_model.predict(X_test)
classification_report_rfc = classification_report(y_test, y_rfc_preds)
print(classification_report_rfc)

# better metrics with random forest clas.

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1027
           1       0.68      0.48      0.56       380

    accuracy                           0.80      1407
   macro avg       0.75      0.70      0.71      1407
weighted avg       0.78      0.80      0.78      1407



In [261]:
# further exploration of rfc metrics

pd.crosstab(y_test, y_rfc_preds, rownames=["Actual label"], colnames=["Predicted Labels"])

Predicted Labels,0,1
Actual label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,940,87
1,199,181


In [262]:
# oversampling
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

X_resampled_df = pd.DataFrame(X_resampled, columns=X_train.columns)
y_resampled_df = pd.Series(y_resampled, name="Churn")
y_resampled_df.value_counts()

0    4136
1    4136
Name: Churn, dtype: int64

In [263]:
from sklearn.ensemble import RandomForestClassifier

rfc_model_rsm = RandomForestClassifier()
rfc_model_rsm.fit(X_resampled, y_resampled)
rfc_model_rsm.score(X_test, y_test)

0.7810945273631841

In [264]:
from sklearn.metrics import classification_report

y_rfc_preds_rsm = rfc_model_rsm.predict(X_test)

classification_report_oversampled = classification_report(y_test, y_rfc_preds_rsm )
print(classification_report_oversampled)

              precision    recall  f1-score   support

           0       0.82      0.89      0.86      1027
           1       0.62      0.47      0.54       380

    accuracy                           0.78      1407
   macro avg       0.72      0.68      0.70      1407
weighted avg       0.77      0.78      0.77      1407



In [265]:
# undersampling
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler(random_state=42)

X_undersampled, y_undersampled = undersampler.fit_resample(X_train, y_train)

X_underresampled_df = pd.DataFrame(X_undersampled, columns=X_train.columns)
y_underresampled_df = pd.Series(y_undersampled, name="Churn")
print(len(X_underresampled_df))

2978


In [266]:
from sklearn.ensemble import RandomForestClassifier

rfc_model_undersampled = RandomForestClassifier(n_estimators=10, min_samples_split=2, min_samples_leaf=2, max_features='log2', max_depth=5)
rfc_model_undersampled.fit(X_undersampled, y_undersampled)
rfc_model_rsm.score(X_test, y_test)

0.7810945273631841

In [267]:
from sklearn.metrics import classification_report

y_rfc_preds_undersampled = rfc_model_undersampled.predict(X_test)

classification_report_undersampled = classification_report(y_test, y_rfc_preds_undersampled)
print(classification_report_undersampled)

              precision    recall  f1-score   support

           0       0.92      0.69      0.79      1027
           1       0.50      0.84      0.63       380

    accuracy                           0.73      1407
   macro avg       0.71      0.77      0.71      1407
weighted avg       0.81      0.73      0.75      1407



In [117]:
# Results for different aproaches
print('SVM')
print(classification_report_svm)
print('Random Forest Classification')
print(classification_report_rfc)
print('Random Forest Classification Oversampled')
print(classification_report_oversampled)
print('Random Forest Classification Undersampled')
print(classification_report_undersampled)

SVM
              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1055
           1       0.00      0.00      0.00       352

    accuracy                           0.75      1407
   macro avg       0.37      0.50      0.43      1407
weighted avg       0.56      0.75      0.64      1407

Random Forest Classification
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1055
           1       0.57      0.48      0.52       352

    accuracy                           0.78      1407
   macro avg       0.70      0.68      0.69      1407
weighted avg       0.77      0.78      0.77      1407

Random Forest Classification Oversampled
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1055
           1       0.57      0.49      0.53       352

    accuracy                           0.78      1407
   macro avg       0.70      0.68      0.69      1407


# No significant difference between different aproaches in score but huge in recall

In [120]:
# shuffling data

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data = data.sample(frac=1)

# getting data ready

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

data['Churn'].unique()
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

data.isnull().sum()
len(data)
# Total charges is null in 11 cases - removing those cases
data = data.dropna()

# encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['gender', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
one_hot = OneHotEncoder(sparse_output=False)
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')
transformed_X = transformer.fit_transform(X)

# Saving the names of colums
encoded_feature_names = transformer.named_transformers_['one_hot'].get_feature_names_out(categorical_features)
all_feature_names = list(encoded_feature_names) + [col for col in X.columns if col not in categorical_features]

X_transformed_df = pd.DataFrame(transformed_X, columns=all_feature_names)
X_transformed_df.head()


from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
rfc_model.score(X_test, y_test)

0.7896233120113717

In [219]:
# RandomizedSeacrhCV
from sklearn.model_selection import RandomizedSearchCV

grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
       'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['log2', 'sqrt'],
        'min_samples_split': [2,4,6],
        'min_samples_leaf': [1,2,4]
       }

clf = RandomForestClassifier(n_jobs=None)
rs_clf = RandomizedSearchCV(estimator=clf, param_distributions=grid, n_iter=10, cv=5, verbose=2)

rs_clf.fit(X_undersampled, y_undersampled)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=10; total time=   0.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time=   0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=4, n_estimator

In [220]:
print(rs_clf.best_params_)
print(rs_clf.best_score_)

{'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5}
0.751183209110429


In [184]:
# Feature eng.

data = pd.read_csv('./WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.sample(frac=1)
data = data.dropna()
# 1. Family status
data['FamilyStatus'] = 'Single'
data.loc[(data['Partner'] == 'Yes') | (data['Dependents'] == 'Yes'), 'FamilyStatus'] = 'Family'

# 2. Company tenure
data['CompanyTenureCategory'] = pd.cut(data['tenure'], bins=[0, 12, 36, 60, data['tenure'].max()], labels=['New', 'Medium', 'Long', 'Very Long'])

# 3. Services customer is using
data['ServicesUsed'] = 'None'
data.loc[(data['InternetService'] != 'No') & (data['PhoneService'] != 'No'), 'ServicesUsed'] = 'Internet and Phone'
data.loc[(data['InternetService'] != 'No') & (data['PhoneService'] == 'No'), 'ServicesUsed'] = 'Internet Only'
data.loc[(data['InternetService'] == 'No') & (data['PhoneService'] != 'No'), 'ServicesUsed'] = 'Phone Only'

# 4. Level of online security
data['SecurityLevel'] = 'Low'
data.loc[(data['OnlineSecurity'] == 'Yes') & (data['TechSupport'] == 'Yes'), 'SecurityLevel'] = 'High'

# 5. Overview of streaming services
data['StreamingActivity'] = 'Not Active'
data.loc[(data['StreamingTV'] == 'Yes') | (data['StreamingMovies'] == 'Yes'), 'StreamingActivity'] = 'Active'

# 6. Contract type
data['ContractPayment'] = data['Contract'] + ' - ' + data['PaymentMethod']

featured_data = data[['MonthlyCharges', 'TotalCharges', 'FamilyStatus', 'CompanyTenureCategory', 'ServicesUsed', 'SecurityLevel', 'StreamingActivity', 'ContractPayment', 'Churn']]

featured_data = featured_data.dropna()
featured_data.head()

Unnamed: 0,MonthlyCharges,TotalCharges,FamilyStatus,CompanyTenureCategory,ServicesUsed,SecurityLevel,StreamingActivity,ContractPayment,Churn
0,29.85,29.85,Family,New,Internet Only,Low,Not Active,Month-to-month - Electronic check,No
1,56.95,1889.5,Single,Medium,Internet and Phone,Low,Not Active,One year - Mailed check,No
2,53.85,108.15,Single,New,Internet and Phone,Low,Not Active,Month-to-month - Mailed check,Yes
3,42.3,1840.75,Single,Long,Internet Only,High,Not Active,One year - Bank transfer (automatic),No
4,70.7,151.65,Single,New,Internet and Phone,Low,Not Active,Month-to-month - Electronic check,Yes


In [185]:
# getting data ready

featured_data['TotalCharges'] = pd.to_numeric(featured_data['TotalCharges'], errors='coerce')

featured_data['Churn'] = featured_data['Churn'].map({'Yes': 1, 'No': 0})

X = featured_data.drop('Churn', axis=1)
y = featured_data['Churn']

# encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['FamilyStatus', 'CompanyTenureCategory', 'ServicesUsed', 'SecurityLevel', 'StreamingActivity', 'ContractPayment']
one_hot = OneHotEncoder(sparse_output=False)
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')
transformed_X = transformer.fit_transform(X)

# Saving the names of colums
encoded_feature_names = transformer.named_transformers_['one_hot'].get_feature_names_out(categorical_features)
all_feature_names = list(encoded_feature_names) + [col for col in X.columns if col not in categorical_features]

X_transformed_df = pd.DataFrame(transformed_X, columns=all_feature_names)
X_transformed_df.head()

Unnamed: 0,FamilyStatus_Family,FamilyStatus_Single,CompanyTenureCategory_Long,CompanyTenureCategory_Medium,CompanyTenureCategory_New,CompanyTenureCategory_Very Long,ServicesUsed_Internet Only,ServicesUsed_Internet and Phone,ServicesUsed_Phone Only,SecurityLevel_High,...,ContractPayment_One year - Bank transfer (automatic),ContractPayment_One year - Credit card (automatic),ContractPayment_One year - Electronic check,ContractPayment_One year - Mailed check,ContractPayment_Two year - Bank transfer (automatic),ContractPayment_Two year - Credit card (automatic),ContractPayment_Two year - Electronic check,ContractPayment_Two year - Mailed check,MonthlyCharges,TotalCharges
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.85,29.85
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,56.95,1889.5
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.85,108.15
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.3,1840.75
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.7,151.65


In [186]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_transformed_df, y, test_size=.2)

from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
rfc_model.score(X_test, y_test)

0.767590618336887

In [189]:
from sklearn.svm import SVC

svm_model = SVC(kernel='rbf', class_weight='balanced', random_state=42)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.72      0.76      1041
           1       0.40      0.53      0.45       366

    accuracy                           0.67      1407
   macro avg       0.60      0.62      0.61      1407
weighted avg       0.70      0.67      0.68      1407



In [195]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.81      0.72      0.76      1041
           1       0.40      0.53      0.45       366

    accuracy                           0.67      1407
   macro avg       0.60      0.62      0.61      1407
weighted avg       0.70      0.67      0.68      1407



In [215]:
# determining most important features

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
rfc_model.score(X_test, y_test)

print(rfc_model.feature_importances_)

# most important ones are MonthlyCharges and TotalCharges
# trying to predict using just those 2

new_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

new_data = new_data[['TotalCharges', 'MonthlyCharges']]


# getting data ready

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

data['Churn'].unique()
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

new_data = new_data.dropna()

# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_transformed_df, y, test_size=.2)

rfc_model_just_2_features = RandomForestClassifier()
rfc_model_just_2_features.fit(X_train, y_train)

print(rfc_model_just_2_features.score(X_test, y_test))

print(cross_val_score(rfc_model_just_2_features, X_test, y_test))

print(classification_report(y_test, y_pred))

[0.01174052 0.01212582 0.00807218 0.00880119 0.05040183 0.01329375
 0.00530273 0.01487543 0.02092747 0.01108117 0.01020761 0.00952398
 0.01034323 0.01199585 0.01062581 0.06520767 0.01024162 0.00540484
 0.00656766 0.00725933 0.00444762 0.00439435 0.00608794 0.00229124
 0.00213226 0.33466419 0.34198271]
0.7803837953091685
[0.76241135 0.71985816 0.77224199 0.74021352 0.79359431]
              precision    recall  f1-score   support

           0       0.73      0.77      0.75      1029
           1       0.25      0.21      0.23       378

    accuracy                           0.62      1407
   macro avg       0.49      0.49      0.49      1407
weighted avg       0.60      0.62      0.61      1407



In [226]:
# undersampling
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler(random_state=42)

X_undersampled, y_undersampled = undersampler.fit_resample(X_train, y_train)

X_underresampled_df = pd.DataFrame(X_undersampled, columns=X_train.columns)
y_underresampled_df = pd.Series(y_undersampled, name="Churn")
print(len(X_underresampled_df))

from sklearn.ensemble import RandomForestClassifier

rfc_model_undersampled = RandomForestClassifier(n_estimators=10, min_samples_split=2, min_samples_leaf=2, max_features='log2', max_depth=5)
rfc_model_undersampled.fit(X_undersampled, y_undersampled)
rfc_model_rsm.score(X_test, y_test)

2982


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- CompanyTenureCategory_Long
- CompanyTenureCategory_Medium
- CompanyTenureCategory_New
- CompanyTenureCategory_Very Long
- ContractPayment_Month-to-month - Bank transfer (automatic)
- ...
Feature names seen at fit time, yet now missing:
- Contract_Month-to-month
- Contract_One year
- Contract_Two year
- Dependents_No
- Dependents_Yes
- ...


In [236]:
# Feature eng.

data = pd.read_csv('./WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.sample(frac=1)
data = data.dropna()
# 1. Family status
data['FamilyStatus'] = 'Single'
data.loc[(data['Partner'] == 'Yes') | (data['Dependents'] == 'Yes'), 'FamilyStatus'] = 'Family'

# 2. Company tenure
data['CompanyTenureCategory'] = pd.cut(data['tenure'], bins=[0, 12, 36, 60, data['tenure'].max()], labels=['New', 'Medium', 'Long', 'Very Long'])

# 3. Services customer is using
data['ServicesUsed'] = 'None'
data.loc[(data['InternetService'] != 'No') & (data['PhoneService'] != 'No'), 'ServicesUsed'] = 'Internet and Phone'
data.loc[(data['InternetService'] != 'No') & (data['PhoneService'] == 'No'), 'ServicesUsed'] = 'Internet Only'
data.loc[(data['InternetService'] == 'No') & (data['PhoneService'] != 'No'), 'ServicesUsed'] = 'Phone Only'

# 4. Level of online security
data['SecurityLevel'] = 'Low'
data.loc[(data['OnlineSecurity'] == 'Yes') & (data['TechSupport'] == 'Yes'), 'SecurityLevel'] = 'High'

# 5. Overview of streaming services
data['StreamingActivity'] = 'Not Active'
data.loc[(data['StreamingTV'] == 'Yes') | (data['StreamingMovies'] == 'Yes'), 'StreamingActivity'] = 'Active'

# 6. Contract type
data['ContractPayment'] = data['Contract'] + ' - ' + data['PaymentMethod']

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

data['Churn'].unique()
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

data.isnull().sum()
len(data)
# Total charges is null in 11 cases - removing those cases
data = data.dropna()

X = data.drop(columns=['customerID', 'Churn'])
y = data['Churn']

# encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['gender', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 
        'FamilyStatus', 'CompanyTenureCategory', 'ServicesUsed', 'SecurityLevel',
        'StreamingActivity', 'ContractPayment']
one_hot = OneHotEncoder(sparse_output=False)
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')
transformed_X = transformer.fit_transform(X)

# Saving the names of colums
encoded_feature_names = transformer.named_transformers_['one_hot'].get_feature_names_out(categorical_features)
all_feature_names = list(encoded_feature_names) + [col for col in X.columns if col not in categorical_features]

X_transformed_df = pd.DataFrame(transformed_X, columns=all_feature_names)

# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=.2)

from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
rfc_model.score(X_test, y_test)


from sklearn.metrics import classification_report

y_preds= rfc_model.predict(X_test)

classification_report_svm = classification_report(y_test, y_preds)

print(classification_report_svm)

              precision    recall  f1-score   support

           0       0.82      0.92      0.86      1032
           1       0.65      0.44      0.52       375

    accuracy                           0.79      1407
   macro avg       0.74      0.68      0.69      1407
weighted avg       0.77      0.79      0.77      1407



In [237]:
print(y_test)

3275    0
746     0
3959    0
3030    1
2894    1
       ..
2321    0
3029    1
2598    0
5329    1
6303    0
Name: Churn, Length: 1407, dtype: int64
