# Working notebook 1st draft $Telco Project Data$

### Imports

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import wrangle as w

# to see all columns in wide datasets
pd.set_option('display.max_columns', None)


In [None]:
# acquire telco data 
df = w.get_telco_data()

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df.shape


In [None]:
df.total_charges.value_counts()

# prepare

In [None]:
# cleaning data
df = w.prep_telco(df)

In [None]:
df


df[df.contract_type != 'Month-to-month']

In [None]:
df.customer_id

In [None]:
df.churn

In [None]:
df.shape

In [None]:
def split_telco_data(df):
    '''
    Split the telco data into train, validate and test sets, stratifying on churn.

    20% of the rows are held out as test; the remaining 80% is then divided
    70/30 into train and validate
    (approximately train 56%, validate 24%, test 20%).
    Returns train, validate, test
    '''
    seed = 123

    # first cut: hold out the test set
    remainder, test = train_test_split(
        df, test_size=.2, random_state=seed, stratify=df.churn
    )
    # second cut: divide what is left into train and validate
    train, validate = train_test_split(
        remainder, test_size=.3, random_state=seed, stratify=remainder.churn
    )
    return train, validate, test

In [None]:
# split data
train, validate, test = w.split_telco_data(df)
train.shape,validate.shape, test.shape



In [None]:
df_predictions = pd.DataFrame(test.customer_id)

In [None]:
df_predictions

In [None]:
train.churn

# Explore

# How often does churn occur?

In [None]:
mean = train.churn.value_counts().mean()
mean

In [None]:
train.churn.value_counts()

In [None]:
def get_churn_mean_bar1(df):
    '''
    Draw a count plot of churn with a horizontal line at the mean class count.

    Note: the "mean" used here is the mean of the churn value counts
    (number of rows divided by the number of churn classes), not the churn rate.
    '''
    # computed once; used for both the title and the reference line
    col_mean = df.churn.value_counts().mean()
    plt.title(f'Churn Mean : {col_mean}')
    # pass the column by keyword: recent seaborn versions treat the first
    # positional argument as `data`, not as the x vector
    sns.countplot(x=df.churn)
    plt.axhline(col_mean, label = 'Churn mean',color='maroon')


In [None]:
def get_churn_mean_bar(df):
    ''' Plot the percentage of Telco customers in each churn class and mark
    the churn rate with a dashed horizontal line.

    df: telco data frame with a 'churn' column holding 'Yes'/'No'
    (the encoding used by the other cells of this notebook).
    Shows the plot; returns nothing.'''

    sns.set_style('white')

    # Share of customers who churned, as a percentage.
    # Comparing against 'Yes' avoids the integer `[1]` lookup on a string
    # index, which depends on deprecated positional fallback and on the
    # sort order of value_counts.
    c_percent = round((df.churn == 'Yes').mean() * 100, 1)

    plt.title(f'Customers churn by {c_percent}%',fontsize=30,fontweight=100,color='midnightblue')
    sns.histplot(data =df, x='churn',element='bars',stat='percent',hue='churn',palette='cubehelix',)
    plt.axhline(c_percent, label = 'Churn Rate',color='midnightblue',linestyle='dashed')

    plt.show();
get_churn_mean_bar(train)

In [None]:
get_churn_mean_bar(train)



In [None]:
train.churn.value_counts(normalize=True)[1]

### It appears that about 26% of telco customers churn.

# Baseline

In [None]:
#Find Baseline
train.churn.value_counts()
#Baseline is 0, customer did not churn

In [None]:
#Baseline Accuracy
baseline = (train.churn == 'No').mean()
baseline

# Does having a high monthly charge affect churn?

### It appears that monthly charges have some bearing on customer churn. Customers with lower monthly charges appear to have stayed with Telco.


In [None]:
train.monthly_charges.describe()

In [None]:
plt.figure(figsize=(12,6))
plt.title('Monthly Charges vs Churn')
sns.histplot(x='monthly_charges', data=train, hue='churn',multiple='dodge', kde= True, bins = 6)

In [None]:
print(sns.color_palette("cubehelix").as_hex())

In [None]:
def get_monthly_charges(df):
    '''Box plot of monthly charges (x) for each churn value (y).'''
    # whis=np.inf: whiskers are not limited to a multiple of the IQR
    box_options = {'whis': np.inf, 'palette': 'cubehelix'}
    plt.title('Monthly Charges vs Churn')
    sns.boxplot(x=df.monthly_charges, y=df.churn, **box_options)
get_monthly_charges(train)

### Stat Test T-test

In [None]:
# Create an array with the colors you want to use
colors = ['#98b49c', '#e7c7e2']
# Set your custom color palette
sns.set_palette(sns.color_palette(colors))
get_monthly_charges(train)

In [None]:
train.churn.value_counts()

In [None]:
train[train.churn == 'Yes']

In [None]:
'''
Hypothesis 
Does monthly charges have a relationship with churn? 

Variables:
* monthly charges (continuous)
* churn(discrete)

Test: T-test two tail two sample (independent) scipy.stats.ttest_ind

$H_0$: Mean monthly charges of Telco customers who churn == mean monthly charges of Telco customers who do not churn.

$H_a$: Mean monthly charges of Telco customers who churn != mean monthly charges of Telco customers who do not churn.
'''

alpha = 0.05

subset_churn = train[train.churn == 'Yes']
subset_notchurn = train[train.churn == 'No']

# Levene test - a small p-value means the two groups have unequal variances
stat, pval = stats.levene(subset_churn.monthly_charges, subset_notchurn.monthly_charges)

# use the pooled-variance t-test only when Levene does not reject equal variances
equal_var = pval >= alpha
if not equal_var:
    print('inequal variance ==> set equl_var to False')

t_stat, p_val = stats.ttest_ind(subset_churn.monthly_charges, subset_notchurn.monthly_charges, equal_var=equal_var)

# report the t-test results (not the Levene statistic / p-value)
print(f' t-stat:{t_stat}')
print(f' p-value:{p_val}')

# two-tailed test: compare the t-test p-value with alpha directly
if p_val < alpha:
    print('we can reject H0 ')
    print(f'''
Because the p-value ({p_val}) is less than alpha value ({alpha}), we reject the null hypothesis''')
else:
    print(f'''
Because the p-value ({p_val}) is not less than alpha value ({alpha}), we fail to reject the null hypothesis''')

In [None]:
subset_churn.shape

## T TEST

In [None]:
def get_ttest_monthly_charges(df):
    '''
    Run an independent two-sample t-test on the monthly charges of customers
    who churned versus customers who did not, and print the results.

    df: data frame with a 'churn' column ('Yes'/'No') and a numeric
        'monthly_charges' column.

    A Levene test decides whether equal variances are assumed: when its
    p-value is below alpha the unequal-variance (Welch) t-test is used.
    Prints the t-statistic and p-value rounded to 4 decimals; returns nothing.
    '''
    alpha = 0.05

    # two independent sample groups of customers: churn and not churn
    subset_churn = df[df.churn == 'Yes']
    subset_notchurn = df[df.churn == 'No']

    # Levene test - a small p-value means unequal variances
    stat, pval = stats.levene(subset_churn.monthly_charges, subset_notchurn.monthly_charges)
    equal_var = bool(pval >= alpha)

    # random_state only matters for permutation tests, so it is not passed
    t_stat, p_val = stats.ttest_ind(subset_churn.monthly_charges, subset_notchurn.monthly_charges, equal_var=equal_var)
    t_stat = round(float(t_stat), 4)
    p_val = round(float(p_val), 4)
    print(f' t-stat:{t_stat}')
    print(f' p-value:{p_val}')

   

get_ttest_monthly_charges(train)

### Reject the null hypothesis: there is a significant difference between the mean monthly charges of customers who churn and those who do not churn.

#  Do Senior Citizens churn more than non-Senior Citizens?

In [None]:
sub_issenior = train[train.senior_citizen== 1]
sub_notsenior = train[train.senior_citizen == 0]

In [None]:
# distplot is deprecated in seaborn; histplot with a density scale and a KDE
# overlay is the documented replacement for a normalized distplot
sns.histplot(x=train.senior_citizen, stat='density', kde=True);

In [None]:
sub_issenior.describe()

In [None]:
sub_notsenior.churn.value_counts(normalize= True)

In [None]:
sub_notsenior.churn.value_counts(normalize= True)

In [None]:
sub_issenior.churn.value_counts(normalize = True)

In [None]:
train.senior_citizen

In [None]:
def senior(train): 
    '''Percent histogram of senior-citizen status (shown as Yes/No), dodged by churn.'''
    # show the 1/0 encoding as readable labels on the x axis
    status_labels = {1: 'Yes', 0: 'No'}
    senior_text = train.senior_citizen.map(status_labels)

    sns.histplot(
        data=train,
        x=senior_text,
        hue='churn',
        stat="percent",
        multiple="dodge",
        shrink=.8,
    )
    plt.show()

senior(train)

In [None]:
train.senior_citizen

In [None]:
sns.barplot(x='senior_citizen', y="churn", hue="churn", 
                  data=train, ci=None)

In [None]:
fig, ax = plt.subplots()
ax =sns.histplot(data = train, x= 'senior_citizen', stat="percent", multiple="dodge", shrink=.8, hue='churn')
ax.bar(['Not Senior Citizen', 'Senior Citizen'],0)


# sns.histplot(data = train, x=train.senior_citizen == 0, stat="percent")

In [None]:
train.senior_citizen.head()

In [None]:
sns.displot(data = train, x='churn', hue ='senior_citizen', col='senior_citizen',palette= 'cubehelix')
plt.legend(train.churn)

In [None]:
sns.displot()

In [None]:
fig, ax = plt.subplots()
ax = sns.histplot(data = train, x= train.senior_citizen,bins=2 , stat="percent", multiple="dodge",  hue='churn')
ax.bar(['Not Senior Citizen', 'Senior Citizen'],0)
#plt.xticks(np.arange(0,4,1));


In [None]:
sns.histplot(data =train, cbar=True,x='senior_citizen',stat='percent',hue='churn', palette='cubehelix',multiple='dodge')


In [None]:
# Subset by senior status straight from the encoded column (1 = senior, 0 = not).
# When the notebook runs top to bottom, `senior` is still the plotting function
# at this point, so comparing it with 'No'/'Yes' does not produce a row mask.
senior_no = train[train.senior_citizen == 0]
senior_yes = train[train.senior_citizen == 1]

In [None]:
train[train.senior_citizen==1]

In [None]:

# change encoding of senior citizen to text
senior= train.senior_citizen.map({1:'Yes', 0: 'No'})

sns.countplot(x=senior, data=train, hue = 'churn',dodge=False)

# Title
plt.suptitle('Senior Citizens Churn More',fontsize=25,fontweight=100,color='midnightblue')
    
plt.show();
    

In [None]:

def get_bar_senior(df):
    '''
    Show two side-by-side count plots of senior-citizen status by churn:
    grouped bars on the left and overlaid bars (dodge=False) on the right.

    df: telco data frame with 'senior_citizen' (1/0) and 'churn' columns.
    Shows the figure; returns nothing.
    '''
    plt.figure(figsize=(10,5))

    # change encoding of senior citizen to text
    senior = df.senior_citizen.map({1:'Yes', 0: 'No'})

    # Set custom color palette and font size
    colors = ['#6BAF8E', '#E6AFC9']
    sns.set(font_scale=1.5)
    sns.set_palette(sns.color_palette(colors))
    sns.set_style('white')

    # plot the frame that was passed in, not the notebook-level `train`
    plt.subplot(1,2,1)
    sns.countplot(x=senior, data=df, hue = 'churn')

    plt.subplot(1,2,2)
    sns.countplot(x=senior, data=df, hue = 'churn',dodge=False)

    # Title
    plt.suptitle('Senior Citizens Churn More',fontsize=25,fontweight=100,color='midnightblue')

    plt.show();

get_bar_senior(train)

### Stats Test Chi2 test

Do customers who are considered seniors churn more than customers who are not seniors?

Variables:

    * seniors (discrete)
    * churn (discrete)
    
Test: chi^2

$H_0$: There is **no** relationship between a customer's senior status and churn.

$H_a$: There is a relationship between a customer's senior status and churn.

In [None]:
def get_chi2_senior(df):
    '''
    Chi-square test of independence between senior-citizen status and churn.

    df: data frame with 'senior_citizen' (1 marks a senior) and 'churn' columns.
    Prints the chi-square statistic and p-value rounded to 4 decimals;
    returns nothing.
    '''
    # Observed frequencies. Both columns come from the frame passed in, so
    # the function works for any split, not only the notebook-level `train`.
    observed = pd.crosstab(df.senior_citizen == 1, df.churn)

    chi2, p, degf, expected = stats.chi2_contingency(observed)

    chi2 = round(float(chi2), 4)
    p = round(float(p), 4)
    print(f' Chi-Square:{chi2}')
    print(f' p-value:{p}')
    
get_chi2_senior(train)

In [None]:
    # Chi-Square test of independence between senior citizen status and churn,
    # run directly on the train split.

    # significance level for the decision below
    alpha = 0.05

    # Observed frequencies: is-senior (True/False) by churn
    observed = pd.crosstab(train.senior_citizen==1, train.churn)

    chi2, p, degf, expected = stats.chi2_contingency(observed)

    # reject the null (no relationship) only when the p-value is below alpha
    if p < alpha:
        print("Reject the null hypothesis")

    else:
        print("Fail to reject the null")
        print("Insufficient evidence to reject the null")
    # last expression of the cell: displays the statistic and p-value
    chi2,p

### Since we reject the null hypothesis, there appears to be a significant association between senior citizen status and churn.

# Does tenure affect churn?

In [None]:
def get_boxplot_tenure(df):
    '''Box plot of tenure (y) for each churn value (x).'''
    # whis=np.inf: whiskers are not limited to a multiple of the IQR
    box_style = dict(palette='cubehelix', whis=np.inf)
    plt.title('Tenure vs Churn')
    sns.boxplot(x=df.churn, y=df.tenure, **box_style)
get_boxplot_tenure(train)

In [None]:
plt.title('Tenure vs Churn')
sns.boxplot(y=train.tenure, x=train.churn,saturation=.5,palette='gist_ncar',
            whis=np.inf
           )

### Stats Test T-test

In [None]:
'''
Hypothesis 
Does tenure have a relationship with churn? 

Variables:
* tenure (continuous)
* churn(discrete)

Test: T-test two tail two sample (independent) scipy.stats.ttest_ind

$H_0$: Mean tenure of Telco customers who churn == mean tenure of Telco customers who do not churn.

$H_a$: Mean tenure of Telco customers who churn != mean tenure of Telco customers who do not churn.
'''
alpha = 0.05

subset_churn = train[train.churn == 'Yes']
subset_notchurn = train[train.churn == 'No']

# Levene test - a small p-value means unequal variances.
# The result is assigned here so the check below uses the tenure test,
# not a p-value left over from an earlier cell.
stat, pval = stats.levene(subset_churn.tenure, subset_notchurn.tenure)

# a high p-value suggests that the populations have equal variances
equal_var = pval >= alpha
if not equal_var:
    print('inequal variance ==> set equl_var to False')
else:
    print('False')

t_stat, p_val = stats.ttest_ind(subset_churn.tenure, subset_notchurn.tenure, equal_var=equal_var)

print(f' t-stat:{t_stat}')
print(f' p-value:{p_val}')

# two-tailed decision on the t-test p-value (not the Levene p-value)
if p_val < alpha:
    print('we can reject H0 ')
    print(f'''
Because the p-value ({p_val}) is less than alpha value ({alpha}), we reject the null hypothesis''')
else:
    print(f'''
Because the p-value ({p_val}) is not less than alpha value ({alpha}), we fail to reject the null hypothesis''')
In [None]:
# Second Go_______________________
'''
Hypothesis 
Do customers who churn have lower tenure?

Variables:
* tenure (continuous)
* churn(discrete)

Test: T-test one tail two sample (independent) scipy.stats.ttest_ind

$H_0$: Mean tenure of Telco customers who churn >= mean tenure of Telco customers who do not churn.

$H_a$: Mean tenure of Telco customers who churn < mean tenure of Telco customers who do not churn.
'''
alpha = 0.05

subset_churn = train[train.churn == 'Yes']
subset_notchurn = train[train.churn == 'No']

# Levene test - a small p-value means unequal variances.
# Assigned here so the check below does not read a stale `pval`.
stat, pval = stats.levene(subset_churn.tenure, subset_notchurn.tenure)

# a high p-value suggests that the populations have equal variances
equal_var = pval >= alpha
if not equal_var:
    print('inequal variance ==> set equl_var to False')
else:
    print('False')

t_stat, p_val = stats.ttest_ind(subset_churn.tenure, subset_notchurn.tenure, equal_var=equal_var)

print(f' t-stat:{t_stat}')
print(f' p-value:{p_val}')

# One-tailed test (churn mean < not-churn mean): halve the two-sided p-value
# and require the t-statistic to point in the hypothesised direction.
one_tailed_p = p_val / 2
if t_stat < 0 and one_tailed_p < alpha:
    print('we can reject H0 ')
    print(f'''
Because the one-tailed p-value ({one_tailed_p}) is less than alpha value ({alpha}) and the t-statistic is negative, we reject the null hypothesis''')
else:
    print(f'''
We fail to reject the null hypothesis (one-tailed p-value: {one_tailed_p}, t-stat: {t_stat}, alpha: {alpha})''')

In [None]:
def get_ttest_tenure(df):
    '''
    Run an independent two-sample t-test on the tenure of customers who
    churned versus customers who did not, and print the results.

    df: data frame with a 'churn' column ('Yes'/'No') and a numeric 'tenure' column.

    A Levene test decides whether equal variances are assumed. The function
    prints that decision (True/False) followed by the t-statistic and the
    p-value. Returns nothing.
    '''
    alpha = 0.05

    # two independent sample groups of customers: churn and not churn
    subset_churn = df[df.churn == 'Yes']
    subset_notchurn = df[df.churn == 'No']

    # Levene test - a small p-value means unequal variances
    stat, pval = stats.levene(subset_churn.tenure, subset_notchurn.tenure)

    # a high p-value suggests that the populations have equal variances
    variance = bool(pval >= alpha)
    print(variance)

    # random_state only matters for permutation tests, so it is not passed
    t_stat, p_val = stats.ttest_ind(subset_churn.tenure, subset_notchurn.tenure, equal_var=variance)
    print(f' t-stat:{t_stat}')
    print(f' p-value:{p_val}')


In [None]:
get_ttest_tenure(train)

### Since we reject the null hypothesis there appears to be a significant difference in the tenure means of customers who churn and those who do not churn

# Does the contract type of customer affect churn?

In [None]:
plt.title('Contract Type vs Churn')
sns.countplot(x=train.contract_type, data=train, hue = 'churn',palette='cubehelix');


In [None]:
def get_plot_contract(df):
    '''Count plot of contract type with bars split by churn, drawn from `df`.'''
    plt.title('Contract Type vs Churn')
    # use the frame passed in rather than the notebook-level `train`
    sns.countplot(x=df.contract_type, data=df, hue = 'churn',palette='cubehelix');
get_plot_contract(train)

In [None]:
plt.title('Contract Type vs Churn')
sns.histplot(data = train, x=train.contract_type, stat="percent", hue = 'churn', multiple = 'dodge');

In [None]:
train.contract_type.value_counts()

In [None]:
pd.crosstab(train.contract_type, train.churn)

### Stats TEST Chi2

In [None]:
# Chi-square test of independence: contract type vs churn (train split)

alpha = 0.05

# observed frequencies of contract type by churn
observed = pd.crosstab(train.contract_type, train.churn)

chi2, p, degf, expected = stats.chi2_contingency(observed)

# pick the message lines for the outcome, then print them in order
verdict = (
    ["Reject the null hypothesis"]
    if p < alpha
    else ["Fail to reject the null", "Insufficient evidence to reject the null"]
)
for line in verdict:
    print(line)
p, chi2

In [None]:
def get_chi2_contract(df):
    '''
    Chi-square test of independence between contract type and churn.

    Prints the chi-square statistic and the p-value, both rounded to
    4 decimals. Returns nothing.
    '''
    # observed frequencies of contract type by churn
    contingency = pd.crosstab(df.contract_type, df.churn)

    # only the statistic and the p-value are reported
    statistic, p_value, *_ = stats.chi2_contingency(contingency)

    print(f' Chi-Square:{statistic.round(4)}')
    print(f' p-value:{p_value.round(4)}')
    
get_chi2_contract(train)

### There appears to be a significant association between contract type and churn

# Do Senior Citizens pay more monthly charges that non Senior Citizens?

In [None]:
plt.title('High monthly charges Drive Senior Citizens to Churn')
sns.barplot(data= train, x='senior_citizen', y='monthly_charges', hue = 'churn')

Hypothesis for Senior citizens and Monthly charges
Do customers who are considered seniors pay more in monthly charges than customers who are not seniors? 

Variables:
* seniors (discrete)
* month charges(continuous)

Test: two sample, one tail, scipy.stats.ttest_ind

$H_0$: The mean monthly charges for senior citizens <= the mean monthly charges of non senior citizens.

$H_a$: The mean monthly charges for senior citizens > the mean monthly charges of non senior citizens.

In [None]:
total_charges_seniors = train[train.senior_citizen==1].total_charges
total_charges_nonseniors = train[train.senior_citizen==0].total_charges

In [None]:
total_charges_seniors.var(),total_charges_nonseniors.var()

In [None]:
# # stats Levene test - returns p value. small p-value means unequal variances
stat, pval = stats.levene(total_charges_seniors, total_charges_nonseniors)

# high p-value suggests that the populations have equal variances

if pval < 0.05:
    print('inequal variance ==> set equal_var to False')
pval

In [None]:
alpha = 0.05

t_stat, p_val = stats.ttest_ind(total_charges_seniors, total_charges_nonseniors, equal_var = False)
print(f' t-stat:{stat}')
print(f' p-value:{pval}')

if pval/2 < 0.05:
    print('we can reject H0 ')
    
print(f'''
Because the p-value ({p_val}) is less than alpha value ({alpha}), we reject the null hypothesis''')

### Summary:
### Since we reject the null hypothesis, there is significant evidence that senior citizens on average pay higher monthly charges than non senior citizens.

# Does gender influence churn?

In [None]:
plt.title('Gender Vs Churn')
sns.countplot(x=train.gender, data=train, hue = 'churn', palette='cubehelix')

plt.legend()
plt.show()

In [None]:
def get_plot_gender(df):   
    '''Count plot of gender with bars split by churn.'''
    axes = sns.countplot(data=df, x=df.gender, hue='churn', palette='cubehelix')
    axes.set_title('Gender Vs Churn')
    axes.legend()

    plt.show();
get_plot_gender(train)

There does not seem to be a relationship between churn and gender.

### Stats Test Chi2

In [None]:
'''
Hypothesis 
Gender vs churn

Variables:
* gender(discrete)
* churn(discrete)

Test: chi^2 test of independence (scipy.stats.chi2_contingency)

$H_0$: There is **no** relationship between a customer's gender and churn.

$H_a$: There is a relationship between a customer's gender and churn.
'''

# Chi-Square test to compare two categorical variables (gender and churn)

alpha = 0.05

# Observed frequencies of gender by churn; print the table that is tested
observed = pd.crosstab(train.gender, train.churn)
print(observed)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("Reject the null hypothesis")

else:
    print("Fail to reject the null")
    print("Insufficient evidence to reject the null")
p,chi2

In [None]:
def get_chi2_gender(df):
    '''
    Chi-square test of independence between gender and churn.

    Prints the chi-square statistic and the p-value, each rounded to
    4 decimals. Returns nothing.
    '''
    # observed frequencies of gender by churn
    table = pd.crosstab(df.gender, df.churn)

    # run the test; degrees of freedom and expected counts are not reported
    outcome = stats.chi2_contingency(table)
    chi2_stat, p_value = outcome[0], outcome[1]

    report = [
        f' Chi-Square:{chi2_stat.round(4)}',
        f' p-value:{p_value.round(4)}',
    ]
    for row in report:
        print(row)
    
get_chi2_gender(train)

We fail to reject the null hypothesis, so there is no evidence of an association between gender and churn.

### It does not appear that gender has an influence on churn

# Does partner affect the churn?

In [None]:
plt.title('Partner Vs Churn')
sns.countplot(x=train.partner, data=train, hue = 'churn')

plt.legend()
plt.show()

In [None]:
def get_plot_partner(df):    
    '''Count plot of partner status with bars split by churn.'''
    chart = sns.countplot(data=df, x=df.partner, hue='churn', palette='cubehelix')
    chart.set_title('Partner Vs Churn')
    chart.legend()

    plt.show();
    
get_plot_partner(train)

### There seems to be a relationship between partner and churn

# Stats Test Chi2

In [None]:
'''
Hypothesis 
partner vs churn

Variables:
* partner(discrete)
* churn(discrete)

Test: 

$H_0$: There is **no** relationship between a customers and partner

$H_a$: There is a relationship between a customers and partner.
'''
    
print (pd.crosstab(train.partner, train.churn))

# Chi-Square test to compare two categorical variables (partner and churn)


# significance level for the decision below
alpha = 0.05

# Observed frequencies of partner status by churn
observed = pd.crosstab(train.partner, train.churn)

chi2, p, degf, expected = stats.chi2_contingency(observed)

# reject the null (no relationship) only when the p-value is below alpha
if p < alpha:
    print("Reject the null hypothesis")
    
else:
    print("Fail to reject the null")
    print("Insufficient evidence to reject the null")
# last expression of the cell: displays the p-value
p

In [None]:
def get_chi2_partner(df):
    '''
    Chi-square test of independence between partner status and churn.

    Prints the chi-square statistic and the p-value, each rounded to
    4 decimals. Returns nothing.
    '''
    # observed frequencies of partner by churn, fed straight into the test
    chi2_value, p_value, _degf, _expected = stats.chi2_contingency(
        pd.crosstab(df.partner, df.churn)
    )

    # round, then print
    chi2_value = chi2_value.round(4)
    p_value = p_value.round(4)
    print(f' Chi-Square:{chi2_value}')
    print(f' p-value:{p_value}')
    
get_chi2_partner(train)

### We reject the null hypothesis, so there is an association between partner and churn

# Exploration Summary
* Monthly Charges is a driver of churn
* Senior Citizen status is a driver of churn
* Tenure is a driver of churn
* Contract type is a driver of churn
* Partner is a driver of churn
* Gender is not a driver of churn



# Features that will be included in my model

* Monthly charges  has a significant statistical relationship to churn
* Senior Citizen  has a significant statistical relationship to churn
* Tenure  has a significant statistical relationship to churn
* Contract type has a significant statistical relationship to churn
* Partner  has a significant statistical relationship to churn

# Features that will not be included in my model

* Gender did not have a statistical significant relationship to churn.
* Other features have unknown significance to churn at the moment
    * given more time, I would determine their significance to churn.

# Modeling

* Accuracy is the metric used to evaluate the models
* Churn customers makeup 26.5% of the data 
* by guessing non-churn for every customer one could achieve an accuracy of 73.5%
* 73.5% will be the baseline accuracy I use for this project 

* I will be evaluating models developed using four different model types and various hyperparameter configurations

* Models will be evaluated on train and validate data

* The model that performs the best will then be evaluated on test data

In [None]:
train.columns.to_list()

In [None]:
def model_prep(train,validate,test):
    '''
    Reduce each split to the modeling columns and separate the churn target.

    For train, validate and test this keeps only the selected feature
    columns, splits the frame into predictors (x) and target (y), and
    encodes the target as 1 for 'Yes' and 0 for 'No'. The input frames
    are not modified.

    Returns x_train, y_train, x_validate, y_validate, x_test, y_test
    '''
    # columns used for modeling (the target is included so it can be split off)
    features = ['monthly_charges','senior_citizen','tenure','partner_No','partner_Yes','churn','contract_type_Month-to-month','contract_type_One year','contract_type_Two year']
    target = 'churn'
    churn_codes = {'Yes': 1, 'No': 0}

    def _split_xy(frame):
        # keep the modeling columns, then separate predictors from the target
        subset = frame[features]
        x = subset.drop(columns=[target])
        # Build a new, encoded Series: assigning to `y.churn` would only set
        # an attribute on the Series and leave the 'Yes'/'No' values in place.
        # Values outside the mapping (e.g. already numeric) pass through.
        y = subset[target].map(lambda value: churn_codes.get(value, value))
        return x, y

    x_train, y_train = _split_xy(train)
    x_validate, y_validate = _split_xy(validate)
    x_test, y_test = _split_xy(test)

    return x_train,y_train,x_validate,y_validate, x_test, y_test

In [None]:
# prep data for modeling
x_train,y_train,x_validate,y_validate, x_test, y_test = model_prep(train,validate,test)

In [None]:
x_train.shape, x_validate.shape, x_test.shape

# Decision Tree

In [None]:
DecisionTreeClassifier?


In [None]:

# Compare decision trees of depth 1..24 on train and validate accuracy
metrics = []

for i in range(1, 25):
    # build and fit on train only
    tree = DecisionTreeClassifier(max_depth=i, random_state=123).fit(x_train, y_train)

    # one row per depth: accuracy in sample and out of sample
    metrics.append({
        "i": i,
        "train_accuracy": tree.score(x_train, y_train),
        "validate_accuracy": tree.score(x_validate, y_validate),
    })

df_2 = pd.DataFrame(metrics)
# positive difference = the model does better on train than on validate
df_2["difference"] = df_2.train_accuracy - df_2.validate_accuracy
df_2

In [None]:
df_2[df_2.train_accuracy > .77]

### Decision Tree Best Model

In [None]:
# for decision tree the best model has  max_depth of 3.
# 3	0.789942	0.776659	0.01328
tree = DecisionTreeClassifier(max_depth=3, random_state=123)

# Fit the model (on train and only train)
tree = tree.fit(x_train, y_train)

In [None]:
# Accuracy of Decision Tree on train data is 0.789942
# Accuracy of Decision Tree on validate data is 0.776659

In [None]:
# Decision Tree accuracy is about 79% on train data and about 78% on validate data.

In [None]:
def get_tree_model(x_train,y_train,x_validate,y_validate):
    ''' Fit a Decision Tree (max_depth=3, random_state=123) on the train data
        and print its accuracy score on the train data and on the validate data.
        Nothing is returned.
    '''
    model = DecisionTreeClassifier(max_depth=3, random_state=123).fit(x_train, y_train)

    # same message for both splits; train is reported first
    splits = (("train", x_train, y_train), ("validate", x_validate, y_validate))
    for split_name, features, target in splits:
        print(f"Accuracy of Decision Tree on {split_name} data is {model.score(features, target)}")

get_tree_model(x_train,y_train,x_validate,y_validate)

# Random Forest

In [None]:
metrics=[]
for h in range(10,0,-1):
   
    print(f'max depth {h}')
    
    for i in range(1, 21):
  
    # Make the model
        random_forest = RandomForestClassifier(max_depth=h, min_samples_leaf = i , random_state=123)
    
    # Fit the model (on train and only train)
        random_forest.fit(x_train, y_train)
        
         # Use the model
    # We'll evaluate the model's performance on train, first
        in_sample_accuracy = random_forest.score(x_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
        y_pred = random_forest.predict(x_train)
        
        output = {
        "max_depth": h,
        "min_samples_leaf": i,
        "train_accuracy": in_sample_accuracy,
        #"validate_accuracy": y_pred
    }

        metrics.append(output)
        
        df = pd.DataFrame(metrics)
df
  
 

In [None]:
df[df.train_accuracy >.85]

In [None]:
# Grid over max_depth (10 down to 1) and min_samples_leaf (1 to 20),
# recording train and validate accuracy of each random forest.
metrics=[]
for h in range(10,0,-1):

    print(f'max depth {h}')

    for i in range(1, 21):

        # Make the model
        random_forest = RandomForestClassifier(max_depth=h, min_samples_leaf = i , random_state=123)

        # Fit the model (on train and only train)
        random_forest = random_forest.fit(x_train, y_train)

        # Evaluate on train (in sample) and validate (out of sample)
        in_sample_accuracy = random_forest.score(x_train, y_train)
        out_of_sample_accuracy = random_forest.score(x_validate, y_validate)

        metrics.append({
            "max_depth": h,
            "min_samples_leaf": i,
            "train_accuracy": in_sample_accuracy,
            "validate_accuracy": out_of_sample_accuracy,
        })

# Build the results frame once, after the search, instead of on every iteration
df_2 = pd.DataFrame(metrics)
df_2["difference"] = df_2.train_accuracy - df_2.validate_accuracy
df_2

In [None]:
df_2[df_2.difference<.01]

In [None]:
df_2[df_2.validate_accuracy>.784]

### Best Model Random Forest

In [None]:
#Random Forest best model validate on Accuracy data0.020737
# Make the model 8	9	0.821438	0.784360	0.037077
random_forest = RandomForestClassifier(max_depth=8, min_samples_leaf = 9 , random_state=123)
    
# Fit the model (on train and only train)
random_forest = random_forest.fit(x_train, y_train)

In [None]:
def get_random_forest_model(x_train,y_train,x_validate,y_validate, max_depth=8, min_samples_leaf=9):
    ''' Fit a Random Forest on the train data and print its accuracy score on
        the train data and on the validate data. Nothing is returned.

        max_depth, min_samples_leaf: hyperparameters of the forest. The defaults
        (8 and 9) match the model selected in the "Best Model Random Forest" cell.
    '''

    # Set Random Forest Model parameters
    random_forest = RandomForestClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf, random_state=123)

    # Use train data to fit Random Forest model
    random_forest = random_forest.fit(x_train, y_train)

    print(f"Accuracy of Random Forest on train data is {random_forest.score(x_train, y_train)}")
    print(f"Accuracy of Random Forest on validate data is {random_forest.score(x_validate, y_validate)}")

get_random_forest_model(x_train,y_train,x_validate,y_validate)

# KNN

In [None]:

metrics = []

for i in range(1,30):
    KNN = KNeighborsClassifier(n_neighbors=i, algorithm='brute')
    KNN.fit(x_train, y_train)
    
    model_accuracies = {
        'neighbor': i,
        'train_score': KNN.score(x_train, y_train),
        'validate_score': KNN.score(x_validate, y_validate)}
    
    metrics.append(model_accuracies)
    df = pd.DataFrame(metrics)


df

In [None]:
df.train_score-df.validate_score

# Best Model KNN

In [None]:
# Candidate rows copied from the KNN results table
# (row index, neighbors, train score, validate score):
#   6: 0.82 train, 0.77 validate accuracy
#   10: 11 neighbors, 0.813056, 0.773697

# first candidate: 11 neighbors
KNN = KNeighborsClassifier(n_neighbors=11)
KNN.fit(x_train, y_train)

#   24: 25 neighbors, 0.801626, 0.780213
# second candidate: 25 neighbors; rebinding KNN replaces the model fitted above
KNN = KNeighborsClassifier(n_neighbors=25, algorithm='brute')
KNN.fit(x_train, y_train)


In [None]:
def get_knn_model(x_train,y_train,x_validate,y_validate, n_neighbors=25):
    ''' Fit a KNN model on the train data and print its accuracy score on the
        train data and on the validate data. Nothing is returned.

        n_neighbors: number of neighbors used by the classifier (default 25).
    '''

    # Set KNN model parameters
    KNN = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute')

    # Use train data to fit the KNN model
    KNN.fit(x_train, y_train)

    # score each split once and reuse the value for both parts of the message
    train_score = KNN.score(x_train, y_train)
    validate_score = KNN.score(x_validate, y_validate)

    print(f'Accuracy of KNN on train data is {train_score} about {round(train_score*100)}%')
    print(f'Accuracy of KNN on validate data is {validate_score} about {round(validate_score*100)}%')

get_knn_model(x_train,y_train,x_validate,y_validate)

* 

# Logistic Regression

In [None]:
# Define the logistic regression model
logit = LogisticRegression(C=1, random_state=123)

In [None]:
#  fit the model on train data 
logit.fit(x_train, y_train)

In [None]:
y_pred = logit.predict(x_train)

# classification report for Model 2 using train data
print(classification_report(y_train, y_pred))


In [None]:
# predictions of the fitted logistic regression on the validate split
y_p = logit.predict(x_validate)

# classification report for the logistic regression model using validate data
print(classification_report(y_validate, y_p))

In [None]:
# best model .78  on validate,
logit = LogisticRegression(C=1, random_state=123)

In [None]:
def get_logit_model(x_train,y_train,x_validate,y_validate):
    ''' Fit a Logistic Regression model (C=1, random_state=123) on the train
        data and print its accuracy score on the train data and on the validate
        data, followed by the difference between the two. Nothing is returned.
    '''

    # Define the logistic regression model
    logit = LogisticRegression(C=1,random_state=123)

    # Use train data to fit Logistic Regression model
    logit.fit(x_train, y_train)

    # score each split once and reuse the values below
    train_score = logit.score(x_train, y_train)
    validate_score = logit.score(x_validate, y_validate)
    diff = train_score - validate_score

    # the original train line had an unbalanced `int(round(...)`, a syntax
    # error; both lines now use the same two-decimal percentage
    print(f'Accuracy of Logistic Regression on train data is {train_score} about {round(train_score*100,2)}%')
    print(f'Accuracy of Logistic Regression on validate data is {validate_score} about {round(validate_score*100,2)}%')
    print(f'Difference: {round(diff,4)}')

get_logit_model(x_train,y_train,x_validate,y_validate)

In [None]:
# Define the logistic regression model.
# NOTE(review): the original passed solver='s', which is not a valid solver
# name; 'liblinear' is used here as a valid choice for a binary target.
# Confirm which solver was intended.
logit = LogisticRegression(C=1, solver='liblinear', random_state=123)
# fit the model on train data
logit.fit(x_train, y_train)

y_pred = logit.predict(x_train)

# classification report using train data
print(classification_report(y_train, y_pred))

y_p = logit.predict(x_validate)

# classification report using validate data
print(classification_report(y_validate, y_p))