In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Introduction

## Case: Comparing Predictions by SVM, Decision Tree, and Random Forest Classifiers on the Shill Bidding Dataset

Consider the Shill_Bidding_Dataset.csv dataset, which contains details of auctions held for various products on eBay.com. The target column, Class, describes the bidding behavior: 0 is normal and 1 is abnormal. Abnormal behavior can resemble malicious clicks or automatic bidding. This time, compare the accuracy of several classification models.

In [2]:
# Load the Shill Bidding auction records from the local dataset folder.
data = pd.read_csv('./dataset/Shill_Bidding_Dataset.csv')

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6321 entries, 0 to 6320
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Record_ID               6321 non-null   int64  
 1   Auction_ID              6321 non-null   int64  
 2   Bidder_ID               6321 non-null   object 
 3   Bidder_Tendency         6321 non-null   float64
 4   Bidding_Ratio           6321 non-null   float64
 5   Successive_Outbidding   6321 non-null   float64
 6   Last_Bidding            6321 non-null   float64
 7   Auction_Bids            6321 non-null   float64
 8   Starting_Price_Average  6321 non-null   float64
 9   Early_Bidding           6321 non-null   float64
 10  Winning_Ratio           6321 non-null   float64
 11  Auction_Duration        6321 non-null   int64  
 12  Class                   6321 non-null   int64  
dtypes: float64(8), int64(4), object(1)
memory usage: 642.1+ KB
None


Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0
3,4,732,7***n,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0
4,5,900,z***z,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0


In [3]:
# Remove columns that are irrelevant to the study case.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that have already been dropped.
data = data.drop(columns=['Record_ID', 'Auction_ID', 'Bidder_ID'],
                 errors='ignore')
data.head()

Unnamed: 0,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0
3,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0
4,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0


In [4]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the 'Class' label, then hold out 30% of
# the rows for testing. Stratifying on the label keeps the class proportions
# the same in the training and testing parts.
X = data.drop(columns='Class')
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)
print(f"Training dataset size: {X_train.shape}, Testing dataset size: {X_test.shape}")

Training dataset size: (4424, 9), Testing dataset size: (1897, 9)


In [5]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fit three classifiers on the same training split so that their test
# accuracy can be compared side by side.

# Support vector machine with a linear kernel
clf_svm = SVC(kernel='linear', C=1)
clf_svm.fit(X_train, y_train)

# Decision tree. random_state is pinned because scikit-learn permutes the
# candidate features at every split; without a seed the fitted tree (and the
# accuracy reported for it) can change from one run to the next.
clf_tree = DecisionTreeClassifier(random_state=0)
clf_tree.fit(X_train, y_train)

# Random forest of 20 trees; a node needs at least 7 samples to be split.
clf_rfc = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 min_samples_split=7, random_state=0)
clf_rfc.fit(X_train, y_train)

In [10]:
# Test-set accuracy, in percent, for each fitted classifier: one row per
# model and a single 'Accuracy Score' column.
pd.Series(
    {
        'Support Vector Machine': clf_svm.score(X_test, y_test) * 100,
        'Decision Tree Classifier': clf_tree.score(X_test, y_test) * 100,
        'Random Forest Classifier': clf_rfc.score(X_test, y_test) * 100,
    },
    name='Accuracy Score',
).to_frame()

Unnamed: 0,Accuracy Score
Support Vector Machine,98.154982
Decision Tree Classifier,99.89457
Random Forest Classifier,99.209278


## Case: Implementing Different Classification Algorithms

In [15]:
# Load the Telco churn records and replace the spaces in the column names
# with underscores.
data = pd.read_csv('./dataset/Telco_Churn_Data.csv')
data.columns = [c.replace(' ', '_') for c in data.columns]

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4708 entries, 0 to 4707
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Target_Churn                  4708 non-null   object 
 1   Target_Code                   4708 non-null   int64  
 2   Current_Bill_Amt              4708 non-null   int64  
 3   Avg_Calls                     4708 non-null   float64
 4   Avg_Calls_Weekdays            4708 non-null   float64
 5   Account_Age                   4708 non-null   int64  
 6   Percent_Increase_MOM          4708 non-null   float64
 7   Acct_Plan_Subtype             4708 non-null   object 
 8   Complaint_Code                4701 non-null   object 
 9   Avg_Days_Delinquent           4708 non-null   float64
 10  Current_TechSupComplaints     4708 non-null   int64  
 11  Current_Days_OpenWorkOrders   4708 non-null   float64
 12  Equipment_Age                 4708 non-null   int64  
 13  Con

Unnamed: 0,Target_Churn,Target_Code,Current_Bill_Amt,Avg_Calls,Avg_Calls_Weekdays,Account_Age,Percent_Increase_MOM,Acct_Plan_Subtype,Complaint_Code,Avg_Days_Delinquent,Current_TechSupComplaints,Current_Days_OpenWorkOrders,Equipment_Age,Condition_of_Current_Handset,Avg_Hours_WorkOrderOpenned
0,No Churn,0,14210,17950.0,30297.0,24,-0.334193,Gold,Billing Problem,6.2,0,0.0,8,1.0,0.0
1,Churn,1,14407,0.0,0.0,28,0.0,Silver,Moving,1.0,0,0.0,17,1.0,0.0
2,Churn,1,12712,204.666667,10393.6667,23,0.0,Gold,Billing Problem,17.6,0,0.0,23,1.0,0.0
3,No Churn,0,13807,15490.3333,41256.3333,39,0.148986,Silver,Billing Problem,0.0,0,0.0,17,1.0,0.0
4,No Churn,0,3805,5075.0,12333.3333,23,-0.686047,Gold,Billing Problem,3.8,0,0.0,10,1.0,0.0


In [16]:
# Re-type three columns as generic 'object' columns instead of numeric ones.
for column in ('Target_Code',
               'Condition_of_Current_Handset',
               'Current_TechSupComplaints'):
    data[column] = data[column].astype('object')

In [25]:
# Fill the missing entries of two columns with each column's most frequent
# value (its mode).
mode_complaint_code = data['Complaint_Code'].mode()[0]
data['Complaint_Code'] = data['Complaint_Code'].fillna(mode_complaint_code)

mode_current_headset = data['Condition_of_Current_Handset'].mode()[0]
data['Condition_of_Current_Handset'] = (
    data['Condition_of_Current_Handset'].fillna(mode_current_headset)
)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4708 entries, 0 to 4707
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Target_Churn                  4708 non-null   object 
 1   Target_Code                   4708 non-null   object 
 2   Current_Bill_Amt              4708 non-null   int64  
 3   Avg_Calls                     4708 non-null   float64
 4   Avg_Calls_Weekdays            4708 non-null   float64
 5   Account_Age                   4708 non-null   int64  
 6   Percent_Increase_MOM          4708 non-null   float64
 7   Acct_Plan_Subtype             4708 non-null   object 
 8   Complaint_Code                4708 non-null   object 
 9   Avg_Days_Delinquent           4708 non-null   float64
 10  Current_TechSupComplaints     4708 non-null   object 
 11  Current_Days_OpenWorkOrders   4708 non-null   float64
 12  Equipment_Age                 4708 non-null   int64  
 13  Con

Unnamed: 0,Target_Churn,Target_Code,Current_Bill_Amt,Avg_Calls,Avg_Calls_Weekdays,Account_Age,Percent_Increase_MOM,Acct_Plan_Subtype,Complaint_Code,Avg_Days_Delinquent,Current_TechSupComplaints,Current_Days_OpenWorkOrders,Equipment_Age,Condition_of_Current_Handset,Avg_Hours_WorkOrderOpenned
0,No Churn,0,14210,17950.0,30297.0,24,-0.334193,Gold,Billing Problem,6.2,0,0.0,8,1.0,0.0
1,Churn,1,14407,0.0,0.0,28,0.0,Silver,Moving,1.0,0,0.0,17,1.0,0.0
2,Churn,1,12712,204.666667,10393.6667,23,0.0,Gold,Billing Problem,17.6,0,0.0,23,1.0,0.0
3,No Churn,0,13807,15490.3333,41256.3333,39,0.148986,Silver,Billing Problem,0.0,0,0.0,17,1.0,0.0
4,No Churn,0,3805,5075.0,12333.3333,23,-0.686047,Gold,Billing Problem,3.8,0,0.0,10,1.0,0.0


In [30]:
# Replace the values of the two categorical columns with their integer
# category codes, then preview the encoded columns.
encoded_columns = ['Acct_Plan_Subtype', 'Complaint_Code']
for column in encoded_columns:
    data[column] = data[column].astype('category').cat.codes

data[encoded_columns].head()

Unnamed: 0,Acct_Plan_Subtype,Complaint_Code
0,0,0
1,1,4
2,0,0
3,1,0
4,0,0


In [35]:
from sklearn.model_selection import train_test_split

# Predict the churn code from the seven selected feature columns.
target = "Target_Code"
top7_features = [
    'Avg_Days_Delinquent', 'Percent_Increase_MOM',
    'Avg_Calls_Weekdays', 'Current_Bill_Amt',
    'Avg_Calls', 'Complaint_Code', 'Account_Age',
]
X = data[top7_features]
# An earlier cell stores Target_Code with object dtype; cast it to int here.
y = data[target].astype(int)

# Hold out 15% of the rows for testing, stratified on the label so both
# parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=123, stratify=y
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fit four classifiers on the same training split so they can be compared.
#
# The selected features are on very different scales (for example, Avg_Calls
# and Current_Bill_Amt reach the thousands while Percent_Increase_MOM stays
# below 1 in the preview rows). The lbfgs logistic regression and the
# linear-kernel SVM are both sensitive to feature scale, so each is wrapped
# in a pipeline that standardizes its inputs first. The pipelines expose the
# same fit / predict / score methods; the underlying estimator is available
# as `clf_logreg[-1]` / `clf_svm[-1]`.

# Logistic Regression
print("Logistic Regression")
clf_logreg = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=0, solver='lbfgs'),
)
clf_logreg.fit(X_train, y_train)

# Support Vector Model
print("Support Vector Model")
clf_svm = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', C=1),
)
clf_svm.fit(X_train, y_train)

# Decision Tree Classifier
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed the fitted tree can differ between runs.
print("Decision Tree Classifier")
clf_tree = DecisionTreeClassifier(random_state=0)
clf_tree.fit(X_train, y_train)

# Random Forest Classifier
# 20 trees; a node needs at least 7 samples to be split.
print("Random Forest Classifier")
clf_rfc = RandomForestClassifier(n_estimators=20, max_depth=None,
                                 min_samples_split=7, random_state=0)
clf_rfc.fit(X_train, y_train)
