In [69]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score

In [70]:
train=pd.read_csv('train_s3TEQDk.csv')
test=pd.read_csv('test_mSzZ8RL.csv')

In [71]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


In [72]:
train['Credit_Product'].value_counts()

No     144357
Yes     72043
Name: Credit_Product, dtype: int64

In [73]:
test['Credit_Product']=test['Credit_Product'].fillna(test['Credit_Product'].mode()[0])

__Model Building:__

In [74]:
# Labelencoding
le = LabelEncoder()
var_mod = train.select_dtypes(include='object').columns
for i in var_mod:
    train[i] = le.fit_transform(train[i])
    
for i in var_mod:
    test[i] = le.fit_transform(test[i])

In [75]:
# Seperate Features and Target
y= train['Is_Lead']
X= train.drop(columns = ['Is_Lead','ID','Region_Code'], axis=1)
test=test.drop(columns=['ID','Region_Code'],axis=1)

In [76]:
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,stratify=y)

In [77]:
algos = [LogisticRegression(),  Ridge(alpha=0.2), Lasso(alpha=0.001),
          KNeighborsClassifier(), DecisionTreeClassifier(),AdaBoostClassifier()]
names = ['Logistic Regression', 'Ridge Regression', 'Lasso Regression',
         'K Neighbors Regressor', 'Decision Tree Regressor','AdaboostClassifier']

rocauc_list=[]
rocauc_train=[]

In [78]:
for name in algos:
    model = name
    model.fit(X_train,y_train)
    y_pred = model.predict(X_valid)
    y_pred_train=model.predict(X_train)
    rocauc= metrics.roc_auc_score(y_valid,y_pred)
    rocaucfortrain=metrics.roc_auc_score(y_train,y_pred_train)
    rocauc_list.append(rocauc)
    rocauc_train.append(rocaucfortrain)

In [79]:
evaluation = pd.DataFrame({'Model': names,
                           'roc_auc': rocauc_list,'rocauc_for_train': rocauc_train})

In [80]:
evaluation

Unnamed: 0,Model,roc_auc,rocauc_for_train
0,Logistic Regression,0.5,0.5
1,Ridge Regression,0.85179,0.852406
2,Lasso Regression,0.851792,0.852553
3,K Neighbors Regressor,0.530995,0.63663
4,Decision Tree Regressor,0.709574,0.999936
5,AdaboostClassifier,0.733677,0.734056


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV

lassocv = LassoCV(alphas= [0.01,0.03,0.001,0.1,0.2,0.02,0.002],cv =5)
lassocv.fit(X_train, y_train)

LassoCV(alphas=[0.01, 0.03, 0.001, 0.1, 0.2, 0.02, 0.002], cv=5)

In [14]:
alpha = lassocv.alpha_
alpha

0.001

In [15]:
# Model Building
features= X.columns
lasso = Lasso(alpha=0.001)
lasso.fit(X_train,y_train)
y_pred = lasso.predict(X_valid)
coef = pd.Series(lasso.coef_,features).sort_values()

In [16]:
print('The coefficients:','\n',coef)

The coefficients: 
 Occupation            -1.851041e-03
Age                   -2.517641e-04
Avg_Account_Balance   -4.769524e-09
Vintage                1.575969e-03
Gender                 1.988278e-03
Channel_Code           9.088812e-03
Is_Active              5.243362e-02
Credit_Product         3.243253e-01
dtype: float64


In [26]:
train_preds = lasso.predict(X_train)
test_preds = lasso.predict(X_valid)
train_auc = roc_auc_score(y_train, train_preds)
test_auc = roc_auc_score(y_valid, test_preds)

In [29]:
print('Test set auc:', test_auc, 'and', 'Train set auc:', train_auc)

Test set auc: 0.8528730767456352 and Train set auc: 0.8522232660010982


In [21]:
submission = pd.read_csv('sample_submission_eyYijxG.csv')
final_predictions = lasso.predict(test)
submission['Is_Lead'] = final_predictions
submission['Is_Lead'] = submission['Is_Lead'].apply(lambda x: 0 if x<0 else x)
submission.to_csv('Lasso_submission1.csv', index=False)

In [23]:
submission['Is_Lead'].head()

0    0.334329
1    0.052518
2    0.000000
3    0.022012
4    0.000000
Name: Is_Lead, dtype: float64