In [None]:
import pandas as pd
import numpy as np                     
import seaborn as sns                  
import matplotlib.pyplot as plt 
import seaborn as sn                   
%matplotlib inline
import warnings                        
warnings.filterwarnings("ignore")

In [None]:
# Read the training and test sets from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Display the column labels of the test frame.
test.columns

In [None]:
# Display the (rows, columns) shape of the training and test frames as a pair.
train.shape, test.shape

In [None]:
# Display the dtype pandas assigned to each training column when reading the CSV.
train.dtypes

In [None]:
# Preview the first rows of the training frame.
train.head()

## Univariate Analysis

In [None]:
# Count how many training rows carry each value of the 'subscribed' column.
train['subscribed'].value_counts()

In [None]:
# normalize=True returns relative frequencies (fractions) instead of raw counts.
train['subscribed'].value_counts(normalize=True)

In [None]:
# Bar chart of the number of rows per 'subscribed' value.
subscribed_counts = train['subscribed'].value_counts()
subscribed_counts.plot(kind="bar")

In [None]:
# Distribution of the 'age' column: histogram with a kernel-density overlay.
# distplot() is deprecated in seaborn (since 0.11); histplot with kde=True and
# stat="density" is the documented replacement and keeps the y-axis on the
# density scale that distplot used.
sn.histplot(train["age"], kde=True, stat="density")

In [None]:
# Bar chart of the number of rows per 'job' value.
job_counts = train['job'].value_counts()
job_counts.plot(kind="bar")

In [None]:
# Bar chart of the number of rows per 'default' value.
default_counts = train['default'].value_counts()
default_counts.plot(kind="bar")

## Bivariate Analysis

In [None]:
# Contingency table of 'job' against 'subscribed': print the raw counts, then
# plot each job's row scaled by its row total as a stacked bar chart.
job = pd.crosstab(train['job'], train['subscribed'])
print(job)

job_row_totals = job.sum(axis=1).astype(float)
job_shares = job.div(job_row_totals, axis=0)
job_shares.plot(kind="bar", stacked=True, figsize=(8, 8))
plt.xlabel('Job')
plt.ylabel('Percentage')

In [None]:
# Contingency table of 'default' against 'subscribed': print the raw counts,
# then plot each row scaled by its row total as a stacked bar chart.
default = pd.crosstab(train['default'], train['subscribed'])
print(default)

default_row_totals = default.sum(axis=1).astype(float)
default_shares = default.div(default_row_totals, axis=0)
default_shares.plot(kind="bar", stacked=True, figsize=(8, 8))
plt.xlabel('default')
plt.ylabel('Percentage')

In [None]:
# Encode the 'subscribed' column as integers: 'no' -> 0, 'yes' -> 1.
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on train['subscribed']: that call operates on an
# intermediate Series and leaves the frame unchanged under pandas Copy-on-Write.
# The 0/1 keys make the cell safe to re-run on already-encoded data; any other
# value maps to NaN, so astype(int) raises instead of passing it on silently.
subscribed_codes = {'no': 0, 'yes': 1, 0: 0, 1: 1}
train['subscribed'] = train['subscribed'].map(subscribed_codes).astype(int)

In [None]:
# Correlation heatmap of the numeric columns of the training frame.
# Restrict to numeric columns explicitly: DataFrame.corr() on pandas >= 2.0 no
# longer drops non-numeric columns silently and raises on string columns.
corr = train.select_dtypes(include='number').corr()

# Hide the redundant upper triangle (above the diagonal) with an explicit
# boolean mask. Deriving the mask from the correlation values themselves would
# leave any cell whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(20, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
sn.heatmap(corr, mask=mask, vmax=.9, square=True, annot=True, cmap="YlGnBu", ax=ax)

In [None]:
# Number of missing values in each column of the training frame.
missing_flags = train.isna()
missing_flags.sum()

## Model Building

In [None]:
# Separate the label from the features: `target` holds the 'subscribed' column
# and `train` keeps every other column.
target = train['subscribed']
# Name the axis by keyword: the positional form drop('subscribed', 1) raises
# TypeError on pandas >= 2.0, where the positional axis argument was removed.
train = train.drop(columns='subscribed')

In [None]:
# Expand the training features into indicator (dummy) columns using
# pd.get_dummies with its default arguments; the result replaces `train`.
train = pd.get_dummies(train)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a validation set and keep the remaining 80% for
# fitting; random_state fixes the shuffle so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=12,
)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Create a logistic regression classifier with scikit-learn's default settings.
# NOTE(review): warnings are silenced notebook-wide by warnings.filterwarnings("ignore"),
# so a solver convergence warning would not be shown here; confirm the fit
# converges (or raise max_iter) if validation accuracy looks off.
lreg = LogisticRegression()

In [None]:
# Fit the logistic regression on the training split (features X_train, labels y_train).
lreg.fit(X_train,y_train)

In [None]:
# Predict labels for the validation features with the fitted logistic regression.
prediction = lreg.predict(X_val)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Validation accuracy of the logistic regression: the fraction of validation
# rows whose predicted label equals the true label.
accuracy_score(y_true=y_val, y_pred=prediction)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Create a decision tree classifier limited to depth 4; max_depth is the knob
# to tune for a better validation score. random_state=0 makes the fitted tree
# reproducible across runs.
clf = DecisionTreeClassifier(max_depth=4, random_state=0)

In [None]:
# Fit the decision tree on the training split (features X_train, labels y_train).
clf.fit(X_train,y_train)

In [None]:
# Predict labels for the validation features with the fitted decision tree;
# `predict` holds the resulting array.
predict = clf.predict(X_val)

In [None]:
# Validation accuracy of the decision tree: the fraction of validation rows
# whose predicted label equals the true label.
accuracy_score(y_true=y_val, y_pred=predict)

In [None]:
# Expand the test frame into indicator (dummy) columns with the same
# pd.get_dummies call that was applied to the training features.
test = pd.get_dummies(test)

In [None]:
# Predict labels for the test set with the fitted decision tree.
# Train and test were dummy-encoded separately, so a category that appears in
# only one of the files produces a different set (or order) of columns. Align
# the test features to the columns the model was fitted on: indicator columns
# missing from test are filled with 0 and columns unknown to the model are
# dropped. `test` itself is left untouched.
test_features = test.reindex(columns=X_train.columns, fill_value=0)
test_prediction = clf.predict(test_features)

In [None]:
# Start from an empty frame that will hold the submission columns.
submission = pd.DataFrame()

In [None]:
# Copy the 'ID' column from the test frame and store the decision-tree
# predictions for the test set in a 'subscribed' column.
submission['ID'] = test['ID']
submission['subscribed'] = test_prediction

In [None]:
# Convert the numeric predictions back to labels: 0 -> 'no', 1 -> 'yes'.
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on submission['subscribed']: that call operates on
# an intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write. Values already equal to 'no'/'yes' pass through unchanged, so
# the cell is safe to re-run.
submission['subscribed'] = submission['subscribed'].replace({0: 'no', 1: 'yes'})

In [None]:
# Write the submission frame to 'submission.csv' with a header row and without
# the row index.
submission.to_csv(
    path_or_buf='submission.csv',
    index=False,
    header=True,
)