In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Load the churn dataset and look at the first 5 rows.
# NOTE: a notebook cell only renders its LAST bare expression, so every
# intermediate result is passed to display() (the IPython builtin available in
# any Jupyter kernel) — otherwise head/shape/describe are silently discarded.
df = pd.read_csv('data2/telco_churn.csv')
display(df.head(5))

# Shape of the dataset as (rows, columns)
display(df.shape)

# Summary statistics of the numeric columns
display(df.describe())

# Type of data in each column (info() prints directly, no display() needed)
df.info()

# Dropping Customer ID as it will not determine the probability of a customer churning.
# Rebinding instead of inplace=True keeps the statement chain-friendly.
df = df.drop(columns='Customer_ID')
df.info()


# Separating column names into 3 groups: object, int and float for visualization.
# These are lists of column NAMES (not DataFrames), keyed on the exact dtype.
df_categorical = [col for col in df.columns if df[col].dtype == 'object']
df_integer = [col for col in df.columns if df[col].dtype == 'int64']
df_float = [col for col in df.columns if df[col].dtype == 'float64']



In [None]:
# Count the null entries of every column once and reuse the result below.
null_counts = df.isnull().sum()
print(null_counts)

# The 11 columns with the largest number of missing values, worst first.
null_val = null_counts.sort_values(ascending=False).head(11)

# Same columns expressed as a percentage of all rows.
percent = (null_val / len(df)) * 100

# Side-by-side table of absolute and relative missingness (rendered as the cell output).
pd.DataFrame(
    {
        'Number of missing values': null_val,
        'Percentage of missing data': percent,
    }
)


In [None]:
# Visualizing the features with the most missing values against the churn.
# One loop replaces ten copy-pasted stanzas; to inspect another feature, add
# its column name to this list.
missing_value_features = [
    "numbcars",
    "dwllsize",
    "HHstatin",
    "ownrent",
    "lor",
    "income",
    "adults",
    "infobase",
    "hnd_webcap",
    "prizm_social_one",
]

for feature in missing_value_features:
    # Count of customers per category of `feature`, split by churn.
    fig, ax = plt.subplots()
    sns.countplot(x=feature, hue="churn", data=df, ax=ax)
    ax.set_title(f"churn by {feature}")
    plt.show()

    # Share of each churn value within every category of `feature`.
    # display() is required: inside a loop (or mid-cell) a bare expression is
    # never rendered, so the table would otherwise be computed and thrown away.
    display(
        df.groupby(feature)["churn"]
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )


In [None]:
# Heatmap of the boolean null mask of df: one row per record, one column per
# feature, so columns with many missing entries show up as bands.
plt.figure(figsize=(20, 5))
sns.heatmap(df.isnull())
# plt.show must be CALLED; the bare name only echoes the function's repr.
plt.show()

In [None]:
# Remove the ten listed columns from df in place.
df.drop(
    columns=[
        'numbcars',
        'dwllsize',
        'HHstatin',
        'ownrent',
        'dwlltype',
        'lor',
        'income',
        'adults',
        'infobase',
        'prizm_social_one',
    ],
    inplace=True,
)

In [None]:
# Show the (rows, columns) shape of df at this point in the notebook.
df.shape

In [None]:
# Plotting the selected count/usage columns as histograms, one panel per column.
int_hist = df[['months', 'uniqsubs','actvsubs','totcalls','adjqty','avg3mou','avg3qty','avg3rev']]

# Size the grid to the number of columns instead of a fixed 6x3 layout, which
# left most of the figure empty and squeezed the panels.
hist_grid_cols = 3
hist_grid_rows = -(-int_hist.shape[1] // hist_grid_cols)  # ceiling division

fig1, hist_axes = plt.subplots(
    hist_grid_rows, hist_grid_cols, figsize=(15, 10), squeeze=False
)
hist_axes = hist_axes.ravel()
fig1.suptitle('Histograms for columns with data type as integer\n')

for ax, column in zip(hist_axes, int_hist.columns):
    # NaN makes the histogram range undefined, so drop it before binning.
    values = int_hist[column].dropna()
    # One bin per distinct value, capped at 100 (and never fewer than 1).
    n_bins = max(1, min(values.nunique(), 100))
    ax.hist(values, bins=n_bins, color='#0000BB')
    ax.set_title(column)

# Hide the grid cells that have no column to show.
for ax in hist_axes[int_hist.shape[1]:]:
    ax.set_visible(False)

# Leave room for the suptitle at the top of the figure.
fig1.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.show()



In [None]:
def find_outliers(df,columns):
    """Collect the 3-sigma outliers of each requested column.

    A value is an outlier when it lies more than three standard deviations
    above or below the mean of its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; it is not modified.
    columns : iterable of column labels
        Columns of ``df`` to examine.

    Returns
    -------
    list of pandas.Series
        One Series per entry of ``columns`` (same order), containing the
        outlying values of that column with their original index labels.
        A column without outliers contributes an empty Series; an empty
        ``columns`` yields an empty list.
    """
    outliers = []

    for i in columns:
        # Compute the statistics once per column.
        center = df[i].mean()
        spread = 3 * df[i].std()

        # upper limit
        ucl_val = center + spread

        # lower limit
        lcl_val = center - spread

        # Previously this selection was computed and thrown away, so the
        # function always returned an empty list; keep it instead.
        is_outlier = (df[i] > ucl_val) | (df[i] < lcl_val)
        outliers.append(df.loc[is_outlier, i])

    return outliers