# LOADING NECESSARY LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the train and test datasets

In [2]:
train = pd.read_csv("../input/customer-churn-prediction-2020/train.csv")

In [3]:
test = pd.read_csv("../input/customer-churn-prediction-2020/test.csv")

In [4]:
train.head()

In [5]:
train.describe()

In [6]:
train.isnull().sum()

In [7]:
train.info()

In [8]:
train.select_dtypes(include = 'object').columns

In [9]:
train.select_dtypes(include = 'object').head()

In [10]:
train.area_code.unique()

In [11]:
test.area_code.unique()

## Handling Categorical Variables

In [12]:
train.area_code = train.area_code.map({'area_code_415':415,'area_code_408':408,'area_code_510':510})
test.area_code = test.area_code.map({'area_code_415':415,'area_code_408':408,'area_code_510':510})

In [13]:
train = train.replace({'voice_mail_plan':{'yes':1,'no':0}})
test = test.replace({'voice_mail_plan':{'yes':1,'no':0}})

In [14]:
train = train.replace({'international_plan':{'yes':1,'no':0}})
test = test.replace({'international_plan':{'yes':1,'no':0}})

In [15]:
train = train.replace({'churn':{'yes':1,"no":0}})
test = test.replace({'churn':{'yes':1,"no":0}})

In [16]:
train.state = train.state.astype('category')
test.state = test.state.astype('category')

In [17]:
train.info()

In [18]:
train.head()

## Graphical Analysis

In [19]:
sns.countplot(x = 'churn',data = train)

In [20]:
fig,ax = plt.subplots(figsize = (18,10)) 
x = train.groupby('state')[['churn']].size()
x.plot.bar()

In [21]:
train.groupby('churn')['number_customer_service_calls'].count()

In [22]:
def facet(data,y):
    """Draw a histogram of column ``y`` in one panel per ``churn`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame handed to ``sns.FacetGrid``; it must contain a ``churn`` column
        (used to split the panels) and the column named by ``y``.
    y : str
        Name of the column whose distribution is plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # `height` is the current name of the panel-size keyword; the old `size`
    # keyword was renamed in seaborn 0.9 and is rejected by later releases.
    g = sns.FacetGrid(data, col='churn', height=5)
    g.map(plt.hist, y, alpha=0.5)
    g.add_legend()
    plt.show()

In [23]:
facet(train,'number_customer_service_calls')

In [24]:
facet(train,'total_day_calls')

In [25]:
facet(train,'total_eve_calls')

In [26]:
facet(train,'number_vmail_messages')

In [27]:
facet(train,'voice_mail_plan')

## Checking Correlation between different variables

In [28]:
corr_matrix = train.corr()
corr_matrix

In [29]:
plt.figure(figsize = (10,10))
sns.heatmap(corr_matrix,data = train)

## Dropping variables that are highly correlated with other variables

In [30]:
col_drop = ['total_day_minutes','total_night_minutes','total_eve_minutes','total_intl_minutes']


In [31]:
train = train.drop(columns = col_drop,axis = 1)
test = test.drop(columns = col_drop,axis = 1)

In [32]:
train.head()

In [33]:
corr_matrix = train.corr()
corr_matrix
plt.figure(figsize = (10,10))
sns.heatmap(corr_matrix,data = train)

## Building a churn prediction model

In [34]:
X = train.drop(['state','churn'],axis = 1)
y = train['churn']

In [35]:
X.columns

In [36]:
train.columns

## Feature Scaling

In [37]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [38]:
X_std = scaler.fit_transform(X)

In [39]:
type(X_std)

In [40]:
df = pd.DataFrame(X_std, index = train.index, columns = train.columns[1:15])

In [41]:
df['state'] = train['state']
df['churn'] = train['churn']

In [42]:
df.head()

In [43]:
X_new = df.drop(['state','churn'],axis = 1)
y_new = df['churn']

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
X_train,X_test,y_train,y_test = train_test_split(X_new,y_new,test_size = 0.3,random_state = 42)

In [46]:
X_train.shape

In [47]:
y_train.shape

### Random Forest Regressor Model

In [48]:
from sklearn.ensemble import RandomForestRegressor
for_reg = RandomForestRegressor(random_state = 42)
for_reg.fit(X_train,y_train)

In [49]:
predict = for_reg.predict(X_test)

In [50]:
from sklearn.metrics import mean_squared_error,accuracy_score

In [51]:
predict

In [52]:
accuracy_score(predict.round(),y_test)

In [53]:
mse = mean_squared_error(predict.round(),y_test)

In [54]:
rmse = np.sqrt(mse)
rmse

## Fine-Tune Model

In [55]:
from sklearn.model_selection import GridSearchCV

In [56]:
param_grid = [{'n_estimators':[10,100,1000] ,'max_features':[2,4,6,8,16]}]              

In [57]:
param_grid

In [58]:
grid_search = GridSearchCV(for_reg,param_grid,cv = 3,scoring = 'neg_mean_squared_error',return_train_score = True,n_jobs = 3)

In [59]:
grid_search.fit(X_train,y_train)

In [60]:
grid_search.best_params_

In [61]:
grid_search.best_estimator_

In [62]:
y_pred = grid_search.predict(X_test)

In [63]:
y_pred = y_pred.round()

In [64]:
score = accuracy_score(y_pred,y_test)

In [65]:
print( "Accuracys is"+" "+ str(score*100),"%")

## Creating Test Dataset

In [66]:
test

In [67]:
X_new_test = test.drop(['id','state'],axis = 1)

In [68]:
X_test_std = scaler.fit_transform(X_new_test)

In [69]:
df_test = pd.DataFrame(X_test_std,columns = test.columns[2:])

In [70]:
df_test

In [71]:
predict_test = grid_search.predict(df_test)

In [72]:
pred_value_1 = predict_test.round()

In [73]:
pred_value_1 = np.where(pred_value_1 == 0,'no','yes')

In [74]:
pred_value_1

In [75]:
result = pd.DataFrame({'id':test.id,"churn":pred_value_1})

In [76]:
result

In [78]:
result.to_csv('churn_2.csv',index = False)

In [79]:
kaggle competitions submit -c customer-churn-prediction-2020 -f churn_2.csv -m "Message"