In [1]:
import pandas as pd                   
import numpy as np                   
import matplotlib.pyplot as plt       
import seaborn as sns               
import statsmodels.api as sm         
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [2]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
dtypes = churnData.dtypes
print(dtypes)

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [4]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [5]:
dtypes = churnData.dtypes
print(dtypes)

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [6]:
null_values = churnData.isnull()
print(null_values)

      gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0      False          False    False       False   False         False   
1      False          False    False       False   False         False   
2      False          False    False       False   False         False   
3      False          False    False       False   False         False   
4      False          False    False       False   False         False   
...      ...            ...      ...         ...     ...           ...   
7038   False          False    False       False   False         False   
7039   False          False    False       False   False         False   
7040   False          False    False       False   False         False   
7041   False          False    False       False   False         False   
7042   False          False    False       False   False         False   

      OnlineSecurity  OnlineBackup  DeviceProtection  TechSupport  \
0              False         False        

In [7]:
churnData_filled = churnData.fillna(0)
print(churnData_filled)

      gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0     Female              0     Yes         No       1           No   
1       Male              0      No         No      34          Yes   
2       Male              0      No         No       2          Yes   
3       Male              0      No         No      45           No   
4     Female              0      No         No       2          Yes   
...      ...            ...     ...        ...     ...          ...   
7038    Male              0     Yes        Yes      24          Yes   
7039  Female              0     Yes        Yes      72          Yes   
7040  Female              0     Yes        Yes      11           No   
7041    Male              1     Yes         No       4          Yes   
7042    Male              0      No         No      66          Yes   

     OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV  \
0                No          Yes               No          No         

In [8]:
# Feature selection and extraction 
features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
target = 'Churn'

X = churnData[features]
y = churnData[target]

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Perform logistic regression 
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Check for accuracy 
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Impute missing values with mean
churnData = churnData.fillna(churnData.mean())

# Scale the features using StandardScaler
scaler = StandardScaler()
churnData_scaled = scaler.fit_transform(churnData)

In [None]:
print(churnData['Churn'].value_counts())

In [None]:
# upsampling and downsampling using the resample() and sample() functions

In [9]:
# Check class imbalance
class_counts = churnData['Churn'].value_counts()
print("Class Counts:")
print(class_counts)

# Separate the majority and minority classes
majority_class = churnData[churnData['Churn'] == 'majority']
minority_class = churnData[churnData['Churn'] == 'minority']

# Upsample the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Downsampling the majority class
majority_downsampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the upsampled minority class with the majority class
churnData_balanced = pd.concat([majority_downsampled, minority_upsampled])

# Separate the features and target variable from the balanced dataset
X = churnData_balanced.drop('Churn', axis=1)
y = churnData_balanced['Churn']

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression model on the training data
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Check the accuracy on the test data
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)


Class Counts:
No     5174
Yes    1869
Name: Churn, dtype: int64


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.