## Importation des librairies

In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 

### data importation 

In [73]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Data cleaning

In [44]:
## Deletion of columns 'fnlwgt' and 'educational-num'
data = data.drop(columns=['fnlwgt','educational-num'], axis = 1) 

col_names = data.columns 
### Fill Nan values 
for c in col_names: 
    data = data.replace("?", np.NaN) 
data = data.apply(lambda x:x.fillna(x.value_counts().index[0])) 

## Encoding with labelEncoder

In [32]:
#Replacing columns names
data.replace(['Divorced', 'Married-AF-spouse',  
              'Married-civ-spouse', 'Married-spouse-absent',  
              'Never-married', 'Separated', 'Widowed'], 
             ['divorced', 'married', 'married', 'married', 
              'not_married', 'not_married', 'not_married'], inplace = True) 
## List of category columns 
category_col =['workclass', 'race', 'education', 'marital-status', 'occupation', 
               'relationship', 'gender', 'native-country', 'income']  
# Encoding of category columns
labelEncoder = LabelEncoder() 
  
mapping_dict ={} 
for col in category_col: 
    data[col] = labelEncoder.fit_transform(data[col]) 
  
    le_name_mapping = dict(zip(labelEncoder.classes_, 
                        labelEncoder.transform(labelEncoder.classes_))) 
  
    mapping_dict[col]= le_name_mapping 
print(mapping_dict) 

{'workclass': {' ?': 0, ' Federal-gov': 1, ' Local-gov': 2, ' Never-worked': 3, ' Private': 4, ' Self-emp-inc': 5, ' Self-emp-not-inc': 6, ' State-gov': 7, ' Without-pay': 8}, 'race': {' Amer-Indian-Eskimo': 0, ' Asian-Pac-Islander': 1, ' Black': 2, ' Other': 3, ' White': 4}, 'education': {' 10th': 0, ' 11th': 1, ' 12th': 2, ' 1st-4th': 3, ' 5th-6th': 4, ' 7th-8th': 5, ' 9th': 6, ' Assoc-acdm': 7, ' Assoc-voc': 8, ' Bachelors': 9, ' Doctorate': 10, ' HS-grad': 11, ' Masters': 12, ' Preschool': 13, ' Prof-school': 14, ' Some-college': 15}, 'marital-status': {' Divorced': 0, ' Married-AF-spouse': 1, ' Married-civ-spouse': 2, ' Married-spouse-absent': 3, ' Never-married': 4, ' Separated': 5, ' Widowed': 6}, 'occupation': {' ?': 0, ' Adm-clerical': 1, ' Armed-Forces': 2, ' Craft-repair': 3, ' Exec-managerial': 4, ' Farming-fishing': 5, ' Handlers-cleaners': 6, ' Machine-op-inspct': 7, ' Other-service': 8, ' Priv-house-serv': 9, ' Prof-specialty': 10, ' Protective-serv': 11, ' Sales': 12, '

In [33]:
data

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,9,4,1,1,4,1,2174,0,40,39,0
1,50,6,9,2,4,0,4,1,0,0,13,39,0
2,38,4,11,0,6,1,4,1,0,0,40,39,0
3,53,4,1,2,6,0,2,1,0,0,40,39,0
4,28,4,9,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,7,2,13,5,4,0,0,0,38,39,0
32557,40,4,11,2,7,0,4,1,0,0,40,39,1
32558,58,4,11,6,1,4,4,0,0,0,40,39,0
32559,22,4,11,4,1,3,4,1,0,0,20,39,0


In [34]:
X= data.iloc[:,:12] # Selection of features
Y = data['income'] # Selection of target

##  Spliting data to train and test

In [75]:
# Split to train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100) 

#### Train model with RandomForest

In [78]:
clf= RandomForestClassifier(n_estimators = 100)
clf.fit(X_train, y_train)
ypred = clf.predict(X_test)
print("Random forest accuracy\'s is ", 
             accuracy_score(y_test, ypred)*100 )

Random forest accuracy's is  85.03429214863343


In [79]:
import pickle
pickle.dump(clf, open("RanForestModel.pkl","wb"))