In [106]:
import pandas as pd 
import numpy as np 
from sqlalchemy import create_engine 
import os
from dotenv import load_dotenv
from urllib.parse import quote_plus
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline  import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [107]:
from sklearn import set_config
set_config(display='diagram')

In [108]:
db_members = pd.read_csv("tables/members_v3.csv")
db_train = pd.read_csv("tables/train_v2.csv")
db_transactions = pd.read_csv("tables/transactions_v2.csv")

### Taking a sample data of 50000 rows

### Filtering the rows of test and transaction based on the presence of foreign key 'msno' in members table

In [109]:
common_items = set(db_train['msno']).intersection(set(db_transactions["msno"]))

db_members = db_members[db_members["msno"].isin(common_items)]
db_members = db_members.head(10000)

In [110]:
db_transactions = db_transactions[db_transactions['msno'].isin(db_members['msno'])]
db_train = db_train[db_train['msno'].isin(db_members['msno'])]

In [111]:
zeros = db_train[db_train['is_churn'] == 0]
ones = db_train[db_train['is_churn'] == 1]
print(zeros.shape)
print(ones.shape)

(9119, 2)
(881, 2)


### Performing undersampling of 0's to match the number fo 1's in the is_churn column

In [112]:
# undersampling 0's to match the number of 1's
zeros_undersampled = resample(zeros,replace=False,n_samples=len(ones),random_state=42)
db_train = pd.concat([zeros_undersampled,ones])

# suffling the results
db_train = db_train.sample(frac=1,random_state=42).reset_index(drop=True)

print(ones.count())
print(zeros_undersampled.count())
print(db_train.shape)

msno        881
is_churn    881
dtype: int64
msno        881
is_churn    881
dtype: int64
(1762, 2)


Python script for changing csv to sql script

In [113]:

def generate_sql(database,output_file,table_name):
    output = output_file
    with open(output, "w", encoding="utf-8") as f:
        for index, row in database.iterrows():
            values = ",".join(
                [ f"'{str(x).replace('\'', '\\\'')}'" if pd.notna(x) else 'Null' for x in row]
            )
            f.write(f"INSERT INTO {table_name} VALUES ({values});\n")

In [114]:
generate_sql(db_members, "queries/members.sql", "members")
generate_sql(db_transactions, "queries/transactions.sql", "transactions")
generate_sql(db_train, "queries/train.sql", "train")


### Importing the main view as a database

In [115]:
load_dotenv( dotenv_path= "login.env")

username = os.getenv("mysql_username")
password = os.getenv("mysql_password")
host = os.getenv("mysql_host")
port = os.getenv("mysql_port")
database = os.getenv("mysql_database")
password = quote_plus(password)

In [116]:
engine = create_engine(f"mysql+pymysql://{username}:{password}@{host}:{port}/{database}")

### Reading the main view and importing it as database

In [117]:
main_db = pd.read_sql("select * from main_view", con=engine)
main_db.head()

Unnamed: 0,msno,city,gender,registered_via,transaction_date,plan_list_price,actual_amount_paid,is_auto_renew,membership_expire_date,payment_method_id,payment_plan_days,is_churn
0,pSTuOZfsry0JrJHIAKJIzjT44QzYwDEDVzb29p2+/zE=,6,male,7,20170313,129,129,1,20170313,41,30,1
1,gGCGr9HakzejGaLxWw3kP0W9DyWEuEcgtbju8qrTqs4=,15,male,9,20170331,149,149,1,20170519,39,30,0
2,gGCGr9HakzejGaLxWw3kP0W9DyWEuEcgtbju8qrTqs4=,15,male,9,20170228,149,149,1,20170419,39,30,0
3,wqqGqVlWfiBx+XsOEWEKjFqOTPsCH0CxV7WOoKGn0Ms=,4,female,7,20170228,149,149,1,20170430,39,30,0
4,wqqGqVlWfiBx+XsOEWEKjFqOTPsCH0CxV7WOoKGn0Ms=,4,female,7,20170331,149,149,1,20170530,39,30,0


### Checking for null values

In [118]:
print(main_db.isna().sum())

msno                      0
city                      0
gender                    0
registered_via            0
transaction_date          0
plan_list_price           0
actual_amount_paid        0
is_auto_renew             0
membership_expire_date    0
payment_method_id         0
payment_plan_days         0
is_churn                  0
dtype: int64


In [119]:
main_db['transaction_date'] = pd.to_datetime(main_db['transaction_date'],format='%Y%m%d')
main_db['membership_expire_date'] = pd.to_datetime(main_db['membership_expire_date'], format='%Y%m%d')


### Dividing data for creating training, testing and validation dataset

In [120]:
rows, features = main_db.shape
x, y = np.hsplit(main_db,[features-1])

  return bound(*args, **kwds)
  return bound(*args, **kwds)


In [121]:
y = y.squeeze()
y = y.astype(int)

In [122]:
y.dtypes

dtype('int64')

### Dropping column msno since it won't be required

In [123]:
x = x.drop(columns=["msno"])

### Checking if plan_list_price is same as actual_ammount_paid

In [124]:
count = 0
for i in range(0,len(x)):
    if(x.loc[i,"plan_list_price"] == x.loc[i,"actual_amount_paid"]):
        count += 1
print("Number of same values in each row :"+str(count))
print("Size of x:"+str(x.shape))

Number of same values in each row :2733
Size of x:(2782, 10)


### Dropping plan_list_price

In [125]:
x = x.drop(columns=["plan_list_price"])

In [126]:
x.head()

Unnamed: 0,city,gender,registered_via,transaction_date,actual_amount_paid,is_auto_renew,membership_expire_date,payment_method_id,payment_plan_days
0,6,male,7,2017-03-13,129,1,2017-03-13,41,30
1,15,male,9,2017-03-31,149,1,2017-05-19,39,30
2,15,male,9,2017-02-28,149,1,2017-04-19,39,30
3,4,female,7,2017-02-28,149,1,2017-04-30,39,30
4,4,female,7,2017-03-31,149,1,2017-05-30,39,30


In [127]:
x["payment_plan_days"].unique()

array([30, 410, 90, 180, 240, 100, 195, 400, 120, 80, 0, 60, 360, 395, 10,
       200, 365, 1, 7, 450, 270], dtype=object)

### Creating Test Train Split

In [128]:
x_train, x_test, x_valid = np.split(x,[int(0.7*len(x)-1), int(0.9*len(x))-1])
y_train, y_test, y_valid = np.split(y,[int(0.7*len(y)-1), int(0.9*len(y))-1])

  return bound(*args, **kwds)
  return bound(*args, **kwds)


## Data Transformation Pipeline

### 1. Perfroming One Hot encoding on the gender column

In [129]:
gen_encoding = ColumnTransformer([
    ("gender", OneHotEncoder(),[1])
],remainder='passthrough')

### 2. Creating a new column which gives the duration of the subscription, by creating a custom transformer

In [130]:
class durationTransform(BaseEstimator,TransformerMixin):
    def fit(self,x,y=None):
        
        return self
    
    def transform(self,x):
        
        # chacking if the input is dataframe or a numpy array
        if isinstance(x,pd.DataFrame):
            db = x.copy()
            db.head()
        else:
            db = pd.DataFrame(x,columns=["transaction_date","membership_expire_date"])
            db.head()
        
        # making sure the required input is in datetime format
        db["transaction_date"] = pd.to_datetime(db["transaction_date"])
        db["membership_expire_date"] = pd.to_datetime(db["membership_expire_date"])
        
        result = (db["membership_expire_date"] - db["transaction_date"]).dt.days
        return result.values.reshape(-1,1)
        

In [131]:
subs_time = ColumnTransformer([
    ("duration_in_days", durationTransform(), [4,7])
    ], remainder='passthrough')

## Creating the pipeline

In [132]:
pipe = Pipeline([
    ('gen_encoding', gen_encoding),
    ('subs_time', subs_time)
])
pipe

In [133]:
result_from_pipe = pipe.fit_transform(x_train,y_train)
x_train = pd.DataFrame(result_from_pipe, columns=["duration_of_subscription","female","male","city","registered_via","ammount_paid","is_auto_reniew","payment_methord","payment_plan_duration"])
result_from_test = pipe.fit_transform(x_test,y_test)
x_test = pd.DataFrame(result_from_test, columns=["duration_of_subscription","female","male","city","registered_via","ammount_paid","is_auto_reniew","payment_methord","payment_plan_duration"])
result_from_valid = pipe.fit_transform(x_valid,y_valid)
x_valid = pd.DataFrame(result_from_valid, columns=["duration_of_subscription","female","male","city","registered_via","ammount_paid","is_auto_reniew","payment_methord","payment_plan_duration"])

## Final DataFrame

In [134]:
x_train.head()

Unnamed: 0,duration_of_subscription,female,male,city,registered_via,ammount_paid,is_auto_reniew,payment_methord,payment_plan_duration
0,0,0.0,1.0,6,7,129,1,41,30
1,49,0.0,1.0,15,9,149,1,39,30
2,50,0.0,1.0,15,9,149,1,39,30
3,61,1.0,0.0,4,7,149,1,39,30
4,60,1.0,0.0,4,7,149,1,39,30


## Training Models and calculating the accuracy

### 1. Logistic regression model

In [135]:
lg = LogisticRegression()
lg.fit(x_train,y_train)
y_pred = lg.predict(x_test)
score = accuracy_score(y_pred,y_test)
print("Accuracy for Logistic Regression: "+str(round((score*100),2))+"%")

Accuracy for Logistic Regression: 66.91%


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 2. Decision Tree model

In [136]:
dst = tree.DecisionTreeClassifier()
dst.fit(x_train,y_train)
y_pred = dst.predict(x_test)
score = accuracy_score(y_pred,y_test)
print("Accuracy for Descision Trees: "+str(round((score*100),2))+"%")

Accuracy for Descision Trees: 80.94%


### 3. Random forest model

In [137]:
rf = RandomForestClassifier(max_depth=10,random_state=1)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
score = accuracy_score(y_pred,y_test)
print("Accuracy for Random Forest: "+str(round((score*100),2))+"%")

Accuracy for Random Forest: 85.79%


### 4. Support Vector Machines

In [138]:
svm = SVC()
svm.fit(x_train,y_train)
y_pred = svm.predict(x_test)
score = accuracy_score(y_pred,y_test)
print("Accuracy for SVM: "+str(round((score*100),2))+"%")

Accuracy for SVM: 67.81%


### 5. K-Nearest Neighbours

In [139]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
score = accuracy_score(y_pred,y_test)
print("Accuracy for K-Nearest Neighbour: "+str(round((score*100),2))+"%")

Accuracy for K-Nearest Neighbour: 83.09%


## Using Joblib for saving the Random Forest Model using JobLib

In [140]:
joblib.dump(rf,"model/model.joblib")

['model/model.joblib']