# Logistic Regression

Use the dataset, perform necessary pre-processing and build a logistic regression model. divide the train data itself into 70-30 ratio and print the performance metrics

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
df = pd.read_csv("telecom_customer_churn.csv")

In [5]:
df

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.60,593.30,0.00,0,381.51,974.81,1,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,1,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,0,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,0,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.90,267.40,0.00,0,22.14,289.54,0,Dissatisfaction,Network reliability
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6584,9986-BONCE,Female,36,No,0,Fallbrook,92028,33.362575,-117.299644,0,...,Bank Withdrawal,20.95,85.50,0.00,0,8.04,93.54,0,Competitor,Competitor made better offer
6585,9987-LUTYD,Female,20,No,0,La Mesa,91941,32.759327,-116.997260,0,...,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,1,,
6586,9992-RRAMN,Male,40,Yes,0,Riverbank,95367,37.734971,-120.954271,1,...,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,0,Dissatisfaction,Product dissatisfaction
6587,9993-LHIEB,Male,21,Yes,0,Solana Beach,92075,33.001813,-117.263628,5,...,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,1,,


In [6]:
df = df.drop(columns=["Customer ID", "City", "Zip Code","Longitude","Latitude", "Churn Category","Churn Reason"])

In [7]:
df.isnull().sum()

Gender                                  0
Age                                     0
Married                                 0
Number of Dependents                    0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3598
Phone Service                           0
Avg Monthly Long Distance Charges     644
Multiple Lines                        644
Internet Service                        0
Internet Type                        1344
Avg Monthly GB Download              1344
Online Security                      1344
Online Backup                        1344
Device Protection Plan               1344
Premium Tech Support                 1344
Streaming TV                         1344
Streaming Movies                     1344
Streaming Music                      1344
Unlimited Data                       1344
Contract                                0
Paperless Billing                       0
Payment Method                    

In [8]:
df["Avg Monthly Long Distance Charges"] = df["Avg Monthly Long Distance Charges"].fillna(0)

In [9]:
df["Avg Monthly GB Download"] = df["Avg Monthly GB Download"].fillna(0)

In [10]:
df["Multiple Lines"] = df["Multiple Lines"].fillna("No Phone")

In [11]:
df["Internet Type"] = df["Internet Type"].fillna("None")

In [12]:
na_cols = ["Online Security", "Online Backup", "Device Protection Plan", "Premium Tech Support", "Streaming TV", "Streaming Movies", "Streaming Music", "Unlimited Data"]
for column in na_cols:
  df[column] = df[column].fillna("No")

In [13]:
df.isnull().sum()

Gender                                  0
Age                                     0
Married                                 0
Number of Dependents                    0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3598
Phone Service                           0
Avg Monthly Long Distance Charges       0
Multiple Lines                          0
Internet Service                        0
Internet Type                           0
Avg Monthly GB Download                 0
Online Security                         0
Online Backup                           0
Device Protection Plan                  0
Premium Tech Support                    0
Streaming TV                            0
Streaming Movies                        0
Streaming Music                         0
Unlimited Data                          0
Contract                                0
Paperless Billing                       0
Payment Method                    

In [14]:
df.shape

(6589, 31)

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
le = LabelEncoder()

for col in df.columns[1:]:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <= 2:
            le.fit(df[col])
            df[col] = le.transform(df[col])

In [17]:
df['Gender'] = [1 if each == 'Female' else 0 for each in df['Gender']]

In [18]:
def encode_data(dataframe):
    if dataframe.dtype == "object":
        dataframe = LabelEncoder().fit_transform(dataframe)
    return dataframe

data = df.apply(lambda x: encode_data(x))
data.head()

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,1,37,1,0,2,9,5,1,42.39,0,...,1,1,1,65.6,593.3,0.0,0,381.51,974.81,1
1,0,46,0,0,0,9,5,1,10.69,2,...,0,0,1,-4.0,542.4,38.33,10,96.21,610.28,1
2,0,50,0,0,0,4,4,1,33.65,0,...,0,1,0,73.9,280.85,0.0,0,134.6,415.45,0
3,0,78,1,0,1,13,3,1,27.82,0,...,0,1,0,98.0,1237.85,0.0,0,361.66,1599.51,0
4,1,75,1,0,3,3,5,1,7.38,0,...,0,1,1,83.9,267.4,0.0,0,22.14,289.54,0


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6589 entries, 0 to 6588
Data columns (total 31 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             6589 non-null   int64  
 1   Age                                6589 non-null   int64  
 2   Married                            6589 non-null   int64  
 3   Number of Dependents               6589 non-null   int64  
 4   Number of Referrals                6589 non-null   int64  
 5   Tenure in Months                   6589 non-null   int64  
 6   Offer                              6589 non-null   int64  
 7   Phone Service                      6589 non-null   int64  
 8   Avg Monthly Long Distance Charges  6589 non-null   float64
 9   Multiple Lines                     6589 non-null   int64  
 10  Internet Service                   6589 non-null   int64  
 11  Internet Type                      6589 non-null   int64

In [20]:
X = data.drop(['Customer Status'], axis=1)
y = data.loc[:, 'Customer Status'].values

In [21]:
X["intercept"] = 1

In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [23]:
vif = pd.DataFrame()
vif["variable"] = X.columns

In [24]:
vif["vif"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

  vif = 1. / (1. - r_squared_i)


In [25]:
vif

Unnamed: 0,variable,vif
0,Gender,1.003389
1,Age,1.662891
2,Married,1.963668
3,Number of Dependents,1.242283
4,Number of Referrals,1.859219
5,Tenure in Months,8.67063
6,Offer,1.184711
7,Phone Service,2.122874
8,Avg Monthly Long Distance Charges,3.48858
9,Multiple Lines,1.356865


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
X_train.shape

(4612, 31)

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
cols = ["Tenure in Months","Avg Monthly Long Distance Charges","Avg Monthly GB Download","Monthly Charge","Total Charges","Total Refunds","Total Extra Data Charges","Total Long Distance Charges"]

In [30]:
scaler = StandardScaler()
X_train[cols] = scaler.fit_transform(X_train[cols])
X_test[cols] = scaler.fit_transform(X_test[cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[cols] = scaler.fit_transform(X_train[cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[cols] = scaler.fit_transform(X_test[cols])


In [31]:
model = LogisticRegression()

In [32]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [33]:
y_pred = model.predict(X_test)

In [34]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.8199291856348002
Precision: 0.8662020905923344
Recall: 0.8834399431414357
F1-score: 0.8747361013370865
