# ModelPrep: a helper class that standardizes how we prepare data and train models

In [None]:
import json
from datetime import datetime
from enum import Enum

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

try:
    # Removed in scikit-learn 1.2; kept importable for older environments only.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None


# Enum Class for type checking and forcing
class Datasets(Enum):
    """Select which of the two CSV files an operation should target.

    Each member's value is the file name that ``ModelPrep.__init__`` passes
    to ``pd.read_csv``.
    """

    TRAINING_SET = 'train.csv'
    TESTING_SET = 'test.csv'


class ModelPrep():
    """Hold the training and testing DataFrames and clean them in lockstep.

    Most clean-up methods apply the same operation to both frames. A column
    may be called differently in each frame; ``training_to_testing_map``
    lists the known alternative names and is consulted whenever the requested
    column is not present under the name given by the caller.
    """

    # Column name used by the notebook -> alternative name for the same column.
    training_to_testing_map = {
        "age": 'age',
        'work-class': 'workclass',
        'work-fnl': 'fnlwgt',
        'education': 'education',
        'education-num': 'education-num',
        'marital-status': 'marital-status',
        'position': 'occupation',
        'relationship': 'relationship',
        'race': 'race',
        'sex': 'sex',
        'capital-gain': 'capital-gain',
        'capital-loss': 'capital-loss',
        'hours-per-week': 'hours-per-week',
        'native-country': 'native-country'
    }

    def __init__(self) -> None:
        """Load the model cache and both CSV files from the working directory."""
        self.columns_properties = {}
        self.cache = self.load_cache()
        self.training_set = pd.read_csv(Datasets.TRAINING_SET.value)
        self.testing_set = pd.read_csv(Datasets.TESTING_SET.value)
        # Align the two testing-set columns the notebook addresses by their
        # training-set names.
        self.testing_set.rename(
            columns={'occupation': 'position', 'workclass': 'work-class'},
            inplace=True,
        )

    def load_cache(self) -> dict:
        """Return the contents of ``model_cache.json``, or ``{}`` if unusable.

        A missing, unreadable or malformed file is treated as "no cache yet";
        any other kind of error is allowed to propagate.
        """
        try:
            with open('model_cache.json', 'r') as file:
                return json.load(file)
        except (OSError, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            print("No Cache Exists yet")
            return {}

    def __update_cache(self, modelName: str, json_property: dict) -> None:
        """Store ``json_property`` under ``modelName`` and persist the cache."""
        self.cache[modelName] = json_property
        self.__write_cache()

    def __write_cache(self) -> None:
        """Write the whole in-memory cache to ``model_cache.json``."""
        with open('model_cache.json', 'w') as file:
            json.dump(self.cache, file)

    def __choose_target_set(self, dataset_type: Datasets) -> pd.DataFrame:
        """Return the training frame for ``TRAINING_SET``, else the testing frame."""
        if dataset_type == Datasets.TRAINING_SET:
            return self.training_set
        return self.testing_set

    def __resolve_column(self, frame: pd.DataFrame, column_name: str) -> str:
        """Return the name under which ``column_name`` exists in ``frame``.

        The given name wins when present; otherwise its alias from
        ``training_to_testing_map`` is tried.

        Raises:
            KeyError: if neither name is a column of ``frame``.
        """
        if column_name in frame.columns:
            return column_name
        alias = self.training_to_testing_map.get(column_name)
        if alias is not None and alias in frame.columns:
            return alias
        raise KeyError(f"Column {column_name!r} not found in dataset")

    def head(self, dataset_type: Datasets) -> None:
        """Print the first rows of the selected dataset."""
        print(self.__choose_target_set(dataset_type).head())

    def shape(self, dataset_type: Datasets) -> tuple:
        """Return ``(rows, columns)`` of the selected dataset."""
        return self.__choose_target_set(dataset_type).shape

    def remove_white_spaces(self) -> None:
        """Strip leading/trailing whitespace from every string cell in both frames.

        Non-string cells (for example NaN) are left untouched, so a single
        missing value no longer prevents the rest of its column from being
        cleaned.
        """
        for frame in (self.training_set, self.testing_set):
            for column in frame.columns:
                values = frame[column]
                is_text = (pd.api.types.is_object_dtype(values)
                           or pd.api.types.is_string_dtype(values))
                if not is_text:
                    continue
                frame[column] = values.map(
                    lambda cell: cell.strip() if isinstance(cell, str) else cell
                )

    def drop_column(self, column_name: str) -> None:
        """Drop ``column_name`` (or its alias) from both frames.

        Raises:
            KeyError: if a frame has the column under neither name.
        """
        for frame in (self.training_set, self.testing_set):
            frame.drop(self.__resolve_column(frame, column_name),
                       axis=1, inplace=True)

    def drop_duplicates(self, dataset_type: Datasets) -> None:
        """Remove duplicate rows from the selected dataset, in place."""
        # Without inplace=True pandas returns a new frame and the stored
        # dataset stays unchanged.
        self.__choose_target_set(dataset_type).drop_duplicates(inplace=True)

    def get_value_counts(self, dataset_type: Datasets, column_name: str):
        """Print how often each value occurs in ``column_name``."""
        print(self.__choose_target_set(dataset_type)[column_name].value_counts())

    def replace_null(self, null_place_holder) -> None:
        """Replace every cell equal to ``null_place_holder`` with NaN in both frames."""
        for frame in (self.training_set, self.testing_set):
            frame.replace(null_place_holder, np.nan, inplace=True)

    def encoder(self, column_name, encoding_map) -> None:
        """Map the values of ``column_name`` through ``encoding_map`` and cast to int.

        A value without an entry in ``encoding_map`` becomes NaN and makes the
        integer cast fail; that error is raised as-is so the unmapped value is
        not hidden.
        """
        for frame in (self.training_set, self.testing_set):
            column = self.__resolve_column(frame, column_name)
            frame[column] = frame[column].map(encoding_map).astype(int)

    def change_column_value(self, column_name, from_value, to_value) -> None:
        """Replace ``from_value`` with ``to_value`` in ``column_name`` of both frames."""
        for frame in (self.training_set, self.testing_set):
            column = self.__resolve_column(frame, column_name)
            # Assign the result back: an in-place replace on ``frame[column]``
            # acts on a temporary object when pandas Copy-on-Write is active.
            frame[column] = frame[column].replace(from_value, to_value)

    def change_column_value_not(self, column_name, not_equal_value, equal_value) -> None:
        """Set every cell of ``column_name`` that differs from ``not_equal_value``.

        Cells equal to ``not_equal_value`` are kept; all others (NaN included)
        become ``equal_value``.
        """
        for frame in (self.training_set, self.testing_set):
            column = self.__resolve_column(frame, column_name)
            frame.loc[frame[column] != not_equal_value, column] = equal_value

    def column_values_with_salary(self, column_name) -> None:
        """Print the percentage split of ``salary`` for each value of ``column_name``."""
        shares = (self.training_set.groupby(column_name)['salary']
                  .value_counts(normalize=True).mul(100).round(1))
        print(shares.astype(str) + '%')

    def draw_correlation(self) -> None:
        """Draw an annotated correlation heatmap of the training set."""
        # Restrict to numeric/boolean columns so text columns that are still
        # un-encoded cannot make ``corr`` raise.
        numeric = self.training_set.select_dtypes(include=['number', 'bool'])
        corr = numeric.corr()
        _, ax = plt.subplots(figsize=(10, 10))
        sns.heatmap(corr,
                    xticklabels=corr.columns.values,
                    yticklabels=corr.columns.values,
                    annot=True, fmt=".1%", linewidths=1.0, square=True, ax=ax)

    def draw_crosstab_with_salary(self, column_name: str, title: str, x_label: str, y_label: str) -> None:
        """Plot a stacked bar chart of ``column_name`` against ``salary`` (training set)."""
        table = pd.crosstab(self.training_set[column_name],
                            self.training_set['salary'])
        table.plot(kind='bar', figsize=(20, 10), stacked=True)
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    def model_test_split(self, test_size, random_state=None):
        """Split the training set into train/validation features and labels.

        Args:
            test_size: forwarded unchanged to ``train_test_split``.
            random_state: optional seed forwarded to ``train_test_split`` to
                make the split reproducible.

        Returns:
            The ``train_test_split`` result for (features, ``salary``).
        """
        self.model_x_train = self.training_set.drop(['salary'], axis=1)
        self.model_y_train = self.training_set['salary']
        return train_test_split(self.model_x_train, self.model_y_train,
                                test_size=test_size, random_state=random_state)

    def machinelearning_model(self, model_name, classification_model, x_train, x_test, y_train, y_test, **kwargs):
        """Fit a classifier, record its score in the cache and plot its confusion matrix.

        Args:
            model_name: key under which the run is stored in the cache.
            classification_model: estimator class, instantiated with ``kwargs``.
            x_train, y_train: data used for fitting.
            x_test, y_test: data used for scoring and the confusion matrix.
            **kwargs: constructor arguments; they are also written to the
                JSON cache together with the score and a timestamp.

        Returns:
            The fitted estimator.
        """
        model = classification_model(**kwargs)
        model.fit(x_train, y_train)
        accuracy = model.score(x_test, y_test)
        print(f"Accuracy of model on testing data: {accuracy}")

        record = dict(kwargs)
        record["Time Trained"] = str(datetime.now())
        record["Accuracy"] = accuracy
        self.__update_cache(model_name, record)

        matrix = ConfusionMatrixDisplay.from_estimator(
            model, x_test, y_test, cmap=plt.cm.Blues)
        color = 'blue'
        matrix.ax_.set_title('Confusion Matrix', color=color)
        matrix.ax_.set_xlabel('Predicted Label', color=color)
        matrix.ax_.set_ylabel('True Label', color=color)
        # The figure holds the matrix axes and the colour-bar axes.
        for axis in matrix.figure_.axes[:2]:
            axis.tick_params(colors=color)
        plt.show()
        return model

    def prepare_submission(self, model, modelName: str):
        """Predict on the testing set and write ``submission-<modelName>.csv``.

        Predictions 0/1 are written as ``' <=50K'`` / ``' >50K'`` in a
        ``salary`` column, with the testing-set index saved as ``index``.
        """
        labels = pd.Series(model.predict(self.testing_set),
                           index=self.testing_set.index, name='salary')
        labels = labels.replace([0, 1], [' <=50K', ' >50K'])
        submission = labels.to_frame()
        submission.index.names = ['index']
        submission.to_csv(f"submission-{modelName}.csv")

# Pre-processing

First Step of Pre-processing, let's initialize our model class.

In [None]:
employeeSalaryModel = ModelPrep()

## Explore Data: 

First let's take a look at data

In [None]:
# Getting Overview of Training Data
employeeSalaryModel.head(Datasets.TRAINING_SET)

We can see our columns there, and at first glance some of them need cleaning, such as Capital Gain and Capital Loss. However, this still doesn't give us a full overview of the data.

We will look into shape to know how many columns and rows we have.


In [None]:
# Getting DataSet Size: 
print(employeeSalaryModel.shape(Datasets.TRAINING_SET))

Also, after inspecting we found that there are some trailing spaces in the values, we will start removing those.

In [None]:
employeeSalaryModel.remove_white_spaces()

Let's look at the value counts of each column in the dataset!

In [None]:
# Show the value distribution of every training-set column, one section per column.
training_columns = employeeSalaryModel.training_set.columns
for column in training_columns:
    print(f"Column: {column}")
    employeeSalaryModel.get_value_counts(Datasets.TRAINING_SET, column)
    print("")

We can see that we have several "?" in Dataset, let's change it into null.

Replace ? to Null and analyze the null values

In [None]:
employeeSalaryModel.replace_null('?')

Let's analyze how values are in the Dataset

In [None]:
employeeSalaryModel.training_set.isnull().sum()

In [None]:
employeeSalaryModel.training_set.nunique()

# Columns Analysis

In [None]:
# Before we start, let's drop the duplicate rows.
employeeSalaryModel.drop_duplicates(Datasets.TRAINING_SET) 

## Age

In [None]:
employeeSalaryModel.training_set['age'].plot(kind='kde')

The age distribution of our dataset is positively skewed (the mean is greater than the median).

In [None]:
# One KDE curve of `age` per salary class. `fill=True` replaces the deprecated
# `shade=True` keyword of seaborn's kdeplot.
age_grid = sns.FacetGrid(employeeSalaryModel.training_set, hue="salary", height=6)
age_grid.map(sns.kdeplot, "age", fill=True)
age_grid.add_legend()
plt.show()

Looking deeper, we can see that the age of people who make <=50K has a left-skewed distribution, while the age of people who make >50K has a normal distribution.

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('age', 'Age vs Salary', 'Age', 'Age vs Salary')

## Work-fnl

Work-fnl has 16k unique values, which means it is probably some kind of identifier. Therefore, we can safely drop it.

In [None]:
employeeSalaryModel.drop_column('work-fnl')

## Work class

Let's analyze how the different values of the work class column relate to salary, and let's not forget that it contains null values. First, let's replace those null values with "Unkown" for now so that we can analyze them.

In [None]:
employeeSalaryModel.change_column_value('work-class', np.nan, 'Unkown')

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('work-class', 'Work Class vs Salary', 'Work class', 'Work Class vs Salary')

We see that most people work in the private sector, and that a large share of self-employed people earn more than 50K. Let's take a closer look at the values.

In [None]:
employeeSalaryModel.column_values_with_salary('work-class')

There are some interesting things here. People in the Without-pay and Never-worked classes earn less than 50K in 100% of cases, so they can be grouped into one category.

The percentage of money earned in government-related jobs are close to each other, so we can group them as well. 

44% of people who are self-employed with an incorporated business (Self-emp-inc) earn more than 50K!

In [None]:
# Collapse the unpaid classes and the three government classes into one group each.
work_class_groups = {
    'Without-pay': 'No pay',
    'Never-worked': 'No pay',
    'Federal-gov': 'gov-work',
    'Local-gov': 'gov-work',
    'State-gov': 'gov-work',
}
for original_value, grouped_value in work_class_groups.items():
    employeeSalaryModel.change_column_value('work-class', original_value, grouped_value)

In [None]:
employeeSalaryModel.column_values_with_salary('work-class')

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('work-class', 'Work Class vs Salary', 'Work class', 'Work Class vs Salary')

In [None]:
work_map = {'No pay': 0, 'Private': 1, 'Self-emp-inc': 2, 'Self-emp-not-inc': 3, 'Unkown': 4, 'gov-work': 5}
employeeSalaryModel.encoder('work-class', work_map)

## Position

In [None]:
employeeSalaryModel.change_column_value('position', np.nan, 'Unkown')

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('position', 'Position vs Salary', 'Position', 'Position vs Salary')

In [None]:
employeeSalaryModel.column_values_with_salary('position')

We can see some patterns in the jobs: those where 80% to 100% of people earn <=50K are low paying, those between 60% and 80% are medium paying, and the rest are high paying.

In [None]:
# Bucket every occupation into one of three pay bands (unknown occupations
# are counted as low paying).
position_groups = {
    'Adm-clerical': 'Low Paying Jobs',
    'Armed-Forces': 'Low Paying Jobs',
    'Unkown': 'Low Paying Jobs',
    'Farming-fishing': 'Low Paying Jobs',
    'Machine-op-inspct': 'Low Paying Jobs',
    'Other-service': 'Low Paying Jobs',
    'Priv-house-serv': 'Low Paying Jobs',
    'Handlers-cleaners': 'Low Paying Jobs',
    'Transport-moving': 'Medium Paying Jobs',
    'Sales': 'Medium Paying Jobs',
    'Tech-support': 'Medium Paying Jobs',
    'Craft-repair': 'Medium Paying Jobs',
    'Protective-serv': 'Medium Paying Jobs',
    'Exec-managerial': 'High Paying Jobs',
    'Prof-specialty': 'High Paying Jobs',
}
for original_value, pay_band in position_groups.items():
    employeeSalaryModel.change_column_value('position', original_value, pay_band)

In [None]:
employeeSalaryModel.column_values_with_salary('position')

In [None]:
encoder = {'Low Paying Jobs': 0, 'Medium Paying Jobs': 1, 'High Paying Jobs': 2}
employeeSalaryModel.encoder('position', encoder)

## Marital Status

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('marital-status', 'Marital Status vs Salary', 'Marital Status', 'Marital Status vs Salary')

NameError: ignored

Interestingly, the married couples have higher percentage of earning over 50k. Let's see that in details

In [None]:
employeeSalaryModel.column_values_with_salary('marital-status')

* About 90% of the widowed, divorced, separated and married-spouse-absent people earn less than 50K, so they can be merged into one group.
* There are two groups of married people, Armed Forces spouses and civilian spouses, and their earning percentages are close, so we can merge them.

In [None]:
# Reduce marital status to a binary married / unmarried split.
marital_status_groups = {
    'Widowed': 'Unmarried',
    'Separated': 'Unmarried',
    'Never-married': 'Unmarried',
    'Married-spouse-absent': 'Unmarried',
    'Divorced': 'Unmarried',
    'Married-civ-spouse': 'married',
    'Married-AF-spouse': 'married',
}
for original_value, grouped_value in marital_status_groups.items():
    employeeSalaryModel.change_column_value('marital-status', original_value, grouped_value)

Let's check the new graph now:

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('marital-status', 'Marital Status vs Salary', 'Marital Status', 'Marital Status vs Salary')

In [None]:
employeeSalaryModel.column_values_with_salary('marital-status')

NameError: ignored

We can see that the percentages are roughly the same.

Let's encode the values!

In [None]:
married_couple_map = {'Unmarried': 0, 'married': 1}
employeeSalaryModel.encoder('marital-status', married_couple_map)

NameError: ignored

## Relationship

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('relationship', 'relationship vs Salary', 'relationship', 'relationship vs Salary')

* As expected from our previous analysis of marital status, husbands and wives have a higher percentage of earning over 50K. The Own-child value is somewhat unique, because someone may have a child and be divorced. However, we can change Husband and Wife to Married!

In [None]:
employeeSalaryModel.column_values_with_salary('relationship')

* Interestingly enough, we can merge Own-child, Unmarried and Other-relative into a single Unmarried group, as their percentages are really close!

In [None]:
# Fold relatives/children into 'Unmarried' and both spouse roles into 'Married'.
relationship_groups = {
    'Other-relative': 'Unmarried',
    'Own-child': 'Unmarried',
    'Wife': 'Married',
    'Husband': 'Married',
}
for original_value, grouped_value in relationship_groups.items():
    employeeSalaryModel.change_column_value('relationship', original_value, grouped_value)

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('relationship', 'relationship vs Salary', 'relationship', 'relationship vs Salary')

Let's encode the rest!

In [None]:
relationship_encoder = {'Married': 0, 'Not-in-family': 1, 'Unmarried': 2}
employeeSalaryModel.encoder('relationship', relationship_encoder)

## Education & Education Num

From the analysis, it is obvious that education and education-num carry the same information! Let's drop education-num and do our own encoding of education after the analysis!

In [None]:
employeeSalaryModel.drop_column('education-num')

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('education', 'Education vs Salary', 'Education', 'Education vs Salary')

* Of course, we can see that any people still in school will hardly ever have any salary above 50k 

In [None]:
employeeSalaryModel.column_values_with_salary('education')

* Also, Doctorate and Prof-school have similar percentages of earning over 50K, and HS-grad and Some-college have similar percentages of earning less than 50K.

* We can drop those rows with students earning more than 50k and yet in 5th grade -> CAN! 

* We need to check the age of those education people

In [None]:
# Stacked bar chart: one bar per education level, split by age.
education_by_age = pd.crosstab(
    employeeSalaryModel.training_set['education'],
    employeeSalaryModel.training_set['age'],
)
education_by_age.plot(kind='bar', figsize=(20,10), stacked=True)
plt.title('Age vs Education')
plt.xlabel('Education')
plt.ylabel('Age')

* That's interesting: most of the people in the 5th-6th grade category are actually over 25 years old, which means school is the highest level they reached. Therefore, we can start grouping those categories. We also group the associate degrees together, group HS-grad with Some-college because both stopped at high school with similar percentages, and, last but not least, group Doctorate with Prof-school as they have similar percentages!

In [None]:
# Merge the fine-grained education levels into broader categories.
education_groups = {
    '1st-4th': 'School',
    '5th-6th': 'School',
    '7th-8th': 'School',
    '9th': 'School',
    '10th': 'School',
    '11th': 'School',
    '12th': 'School',
    'Assoc-acdm': 'Associates',
    'Assoc-voc': 'Associates',
    'Some-college': 'HS',
    'HS-grad': 'HS',
    'Prof-school': 'Prof',
    'Doctorate': 'Prof',
}
for original_value, grouped_value in education_groups.items():
    employeeSalaryModel.change_column_value('education', original_value, grouped_value)

In [None]:
employeeSalaryModel.column_values_with_salary('education')

We can then start re-encoding the values!

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('education', 'Education vs Salary', 'Education', 'Education vs Salary')

In [None]:
education_encoding = {'Preschool': 0, 'School': 1, 'HS': 2, 'Bachelors': 3, 'Associates': 4, 'Prof': 5, 'Masters': 6}
employeeSalaryModel.encoder('education', education_encoding)

## Native Country

From the value counts we looked at in the beginning, we can see that the US dominates this dataset: over 85% of the rows come from the US. This means we have a bias in our dataset. We will leave it for now and look at the correlation heatmap to decide whether to drop the column.

However, we see that there are a couple of nulls there! We can also see that some countries are barely represented in the dataset, so we can keep the US as its own value and group all the other countries as "Others".

In [None]:
employeeSalaryModel.change_column_value_not('native-country', 'United-States', 'Others')

Let's take a look at data for now!

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('native-country', "Native Country vs Salary", 'Native Country', 'Native Country vs Salary')

In [None]:
# Encoder
country_encoder = {'United-States': 0, 'Others': 1}
employeeSalaryModel.encoder('native-country', country_encoder)

## Gender

Let's check the gender, hopefully we will only have TWO genders!

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('sex', 'Gender vs Salary', 'Gender', 'Gender vs Salary')

In [None]:
#Encoder
genders_encoder = {'Male': 0, 'Female': 1}
employeeSalaryModel.encoder('sex', genders_encoder)

## Race

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('race', 'Race vs Salary', 'Race', 'Race vs Salary')

In [None]:
employeeSalaryModel.column_values_with_salary('race')

In [None]:
employeeSalaryModel.change_column_value('race', 'Amer-Indian-Eskimo', 'Other')

In [None]:
encoding_map = {'Other': 0, 'White': 1, 'Black': 2, 'Asian-Pac-Islander': 3}
employeeSalaryModel.encoder('race', encoding_map)

## Capital Gain

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('capital-gain', 'capital-gain vs Salary', 'capital-gain', 'capital-gain vs Salary')

In [None]:
# minmax_scale stays imported because a later cell in this notebook uses it.
from sklearn.preprocessing import MinMaxScaler, minmax_scale

# Fit the scaler on the training data only and reuse it for the testing data,
# so the same raw capital-gain maps to the same scaled value in both sets.
# (Scaling each set with its own min/max makes the feature inconsistent.)
capital_gain_scaler = MinMaxScaler()
employeeSalaryModel.training_set['capital-gain'] = capital_gain_scaler.fit_transform(
    employeeSalaryModel.training_set[['capital-gain']]).ravel()
employeeSalaryModel.testing_set['capital-gain'] = capital_gain_scaler.transform(
    employeeSalaryModel.testing_set[['capital-gain']]).ravel()

## Capital Loss

In [None]:
# This is the Capital Loss section: plot capital-loss (the cell previously
# re-plotted capital-gain by copy-paste).
employeeSalaryModel.draw_crosstab_with_salary('capital-loss', 'capital-loss vs Salary', 'capital-loss', 'capital-loss vs Salary')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit on the training data only, then apply the same min/max to the testing
# data so both sets share one scale.
capital_loss_scaler = MinMaxScaler()
employeeSalaryModel.training_set['capital-loss'] = capital_loss_scaler.fit_transform(
    employeeSalaryModel.training_set[['capital-loss']]).ravel()
employeeSalaryModel.testing_set['capital-loss'] = capital_loss_scaler.transform(
    employeeSalaryModel.testing_set[['capital-loss']]).ravel()

## Hours Per Week

In [None]:
employeeSalaryModel.draw_crosstab_with_salary('hours-per-week', 'Hours Per Week vs Salary', 'Hours Per week', 'Hours Per Week vs Salary')

In [None]:
employeeSalaryModel.column_values_with_salary('hours-per-week')

## Correlation

In [None]:
# Encode the target: 0 for '<=50K', 1 for '>50K' (training set only).
salary_map = {'<=50K': 0, '>50K': 1}
encoded_salary = employeeSalaryModel.training_set['salary'].map(salary_map)
employeeSalaryModel.training_set['salary'] = encoded_salary.astype(int)

In [None]:
employeeSalaryModel.draw_correlation()

**Correlation Interpretation:**

We can see that Gender, Native-country and Race contribute very little to predicting the salary.

In [None]:
# Drop the columns the correlation heatmap showed to be of little use.
for weak_column in ('race', 'native-country'):
    employeeSalaryModel.drop_column(weak_column)

#  Models

In [None]:
# test_size is the held-out fraction: 0.2 keeps 80% of the rows for training.
# (The previous value 0.8 trained every model on only 20% of the data.)
x_train, x_test, y_train, y_test = employeeSalaryModel.model_test_split(0.2)

### Decision Tree: 

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = employeeSalaryModel.machinelearning_model('Decision Tree with ajsd', DecisionTreeClassifier, x_train, x_test, y_train, y_test, max_depth=7, min_samples_leaf=3)


In [None]:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

# Base estimator with a fixed seed.
clf = DecisionTreeClassifier(random_state=123)

# Candidate values for the two hyperparameters being tuned.
params = dict(min_samples_leaf=[1, 2, 3], max_depth=[1, 2, 3])

# Exhaustive search with 10-fold cross-validation on a single worker.
grid = GridSearchCV(clf, params, cv=10, n_jobs=1, verbose=2)
grid.fit(x_train, y_train)

# Cell output: best cross-validated score and the parameters that produced it.
grid.best_score_, grid.best_params_

### Random Forest: 

In [None]:
# Grid search over the key hyperparameters of RandomForestClassifier.
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# define models and parameters
model = RandomForestClassifier()
n_estimators = [10,1000,10000]
max_features = ['sqrt', 'log2']
max_depth = [5,12,27,30]
# define grid search; max_depth was declared above but never handed to the
# search, so its candidate values were silently ignored
grid = dict(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth)
# 10 stratified folds repeated 3 times -> 30 fits per parameter combination
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = employeeSalaryModel.machinelearning_model('Random Forest', RandomForestClassifier, x_train, x_test, y_train, y_test, n_estimators = 1000, max_features = 'sqrt', max_depth =10, random_state = 10)


In [None]:
employeeSalaryModel.prepare_submission(model, 'Random Forest')

### KNN Classifier:

### Logistic Regression: 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Estimator and the candidate values for each hyperparameter.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# Accuracy-scored search; every combination is evaluated on 10 folds x 3 repeats.
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# Report the winning configuration.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


In [None]:
model = employeeSalaryModel.machinelearning_model('Logistic Regression', LogisticRegression, x_train, x_test, y_train, y_test, C=100, penalty='l2', solver='newton-cg')


In [None]:
employeeSalaryModel.prepare_submission(model, 'Logistic Regression')

In [None]:
# Grid search over the key hyperparameters of KNeighborsClassifier.
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# Estimator and the candidate values for each hyperparameter.
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

# Accuracy-scored search; every combination is evaluated on 10 folds x 3 repeats.
grid = {'n_neighbors': n_neighbors, 'weights': weights, 'metric': metric}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# Report the best configuration, then the mean/std score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# NOTE(review): when the cells run top to bottom, the latest assignment to
# `model` above this line is the un-fitted KNeighborsClassifier() from the KNN
# grid-search cell, not a decision tree -- confirm which fitted model this
# submission is meant to use.
employeeSalaryModel.prepare_submission(model, 'Decision Tree')

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Estimator and the regularisation strengths to try.
model = RidgeClassifier()
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Accuracy-scored search; every alpha is evaluated on 10 folds x 3 repeats.
grid = {'alpha': alpha}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# Report the best configuration, then the mean/std score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Grid search over the key hyperparameters of SVC.
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.svm import SVC

# Estimator and the candidate values for each hyperparameter.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

# Accuracy-scored search; every combination is evaluated on 10 folds x 3 repeats.
grid = {'kernel': kernel, 'C': C, 'gamma': gamma}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# Report the best configuration, then the mean/std score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Re-print the per-candidate scores gathered by the previous grid search
# (relies on `means`, `stds` and `params` left behind by that cell).
score_rows = zip(means, stds, params)
for mean, stdev, param in score_rows:
    print("%f (%f) with: %r" % (mean, stdev, param))

NameError: ignored

In [None]:
# Grid search over the key hyperparameters of RandomForestClassifier.
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Estimator and the candidate values for each hyperparameter.
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

# Accuracy-scored search; every combination is evaluated on 10 folds x 3 repeats.
grid = {'n_estimators': n_estimators, 'max_features': max_features}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# Report the best configuration, then the mean/std score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Grid search over the key hyperparameters of GradientBoostingClassifier.
from sklearn.datasets import make_blobs
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Estimator and the candidate values for each hyperparameter.
model = GradientBoostingClassifier()
n_estimators = [12,33]
max_depth = [4,8,12] #range(1,100)
max_features= [2,8,6]#range(1,100)
learning_rate=[0.01,0.05,0.1]

# Accuracy-scored search; every combination is evaluated on 10 folds x 3 repeats.
grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'learning_rate': learning_rate,
    'max_depth': max_depth,
}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# Report the best configuration, then the mean/std score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Grid search over the key hyperparameters of DecisionTreeClassifier.
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Estimator and the candidate values for each hyperparameter.
model = DecisionTreeClassifier()
max_features = range(1,10)
max_depth = range(1,100)
criterion = ['gini', 'entropy']

# Accuracy-scored search; every combination is evaluated on 10 folds x 3 repeats.
grid = {'max_depth': max_depth, 'max_features': max_features, 'criterion': criterion}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# Report the best configuration, then the mean/std score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

### XGBoost Classifier:

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Binary-logistic XGBoost classifier with a fixed seed.
estimator = XGBClassifier(objective='binary:logistic', nthread=4, seed=42)

# Candidate values for tree depth, number of trees and learning rate.
parameters = dict(
    max_depth=range (2, 10, 1),
    n_estimators=range(60, 220, 40),
    learning_rate=[0.1, 0.01, 0.05],
)

# ROC-AUC-scored search with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=10,
    cv=10,
    verbose=True,
)
grid_search.fit(x_train, y_train)

# Cell output: the refitted best estimator.
grid_search.best_estimator_



In [None]:
grid_search.best_estimator_


In [None]:
model = employeeSalaryModel.machinelearning_model('XGBoost', XGBClassifier, x_train, x_test, y_train, y_test, max_depth=5, n_estimators=40, nthread=4, seed=42)


In [None]:
employeeSalaryModel.prepare_submission(model, 'XGboost')