In [1]:
import sys
sys.path.insert(0 , '/home/arpit/study/ML/github')

In [2]:
import mglearn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
#One hot encoding is the way to represent categorical features
#The idea behind dummy variable is to replace the feature with one or more new features that can have the values 0 and 1
#We can represent any number of categories by introducing one new feature per category
#The one hot encoding that we are using is very similar to the dummy encoding used in statistics
#In dummy encoding, it is common to encode a categorical feature with k different possible values into k-1 features
#The last one is represented as all zeros. This is done to simplify the analysis
#You can convert your data into one hot encoding , using either pandas or scikit learn
#Here we are going to use pandas


In [4]:
data = pd.read_csv("/home/arpit/study/ML/github/Intro_to_machine_learning_with_python/data/adult.data" , 
                  header = None ,
                  names = ['age' , 'workclass' , 'fnlwgt' , 'education' , 'education-num' , 
                          'merital-status' , 'occupation' , 'relationship' , 'race' , 'gender' , 'capital-gain' , 
                          'capital-loss' , 'hours-per-week' , 'native-country' , 'income'])
#Header is none because we do not have any header 
#So we are going to put out the names with the help of names attribute


In [5]:
#For the purpose of illustration we are going to select only some of the columns
data = data[['age' , 'workclass' , 'education' , 'gender' , 'hours-per-week' , 'occupation' , 'income']]

In [6]:
#printing the first five columns
data.head(5)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [7]:
#seeing the values of the gender column
#value_counts() is the function used for pandas series , to see what are the unique values in the column and how often 
#they appear

data.gender.value_counts()
#This shows that there are exactly two values, Male and Female

 Male      21790
 Female    10771
Name: gender, dtype: int64

In [8]:
data.workclass.value_counts() ##this is also in a very good shape

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [9]:
data.education.value_counts()

 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64

In [10]:
data.occupation.value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

In [11]:
data.income.value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

In [12]:
#all this shows that the data is already in a very good shape
print("Original feature : \n" , list(data.columns))
data_dummies = pd.get_dummies(data)             #this is a function valid for columns that have object type like string
                                               #or the columns that are categorical
print("Feature after get dummies \n" , list(data_dummies.columns))


Original feature : 
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']
Feature after get dummies 
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-ins

In [13]:
data_dummies.head()
#hours features is not touched while all other categories are encoded

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [14]:
print("original shape : " , data.shape)
print("After encode shape of data : " , data_dummies.shape)
#You can see that there are now 46 features

original shape :  (32561, 7)
After encode shape of data :  (32561, 46)


In [20]:
#you must note that the column indexing in pandar include both the ends
#now we are going to copy all the columns from "age" to "occupation_Transport-moving" to feature X and the rest is copied in 
#the target variable
features = data_dummies.ix[: , 'age' : 'occupation_ Transport-moving']
X = features.values               #to convert it into the numpy array
y = data_dummies['income_ >50K'].values
print("Shape of X : " , X.shape)
print("Shape of y : " , y.shape)

Shape of X :  (32561, 44)
Shape of y :  (32561,)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [21]:
#Now we are ready with the data 
#now we can work in the usual way
from sklearn.model_selection import train_test_split 
X_train , X_test , y_train , y_test = train_test_split(X , y)

In [22]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train , y_train)
print("Training accuracy : " , logreg.score(X_train , y_train))
print("Test accuracy : " , logreg.score(X_test , y_test))

Training accuracy :  0.8141687141687142
Test accuracy :  0.8078860090897924


In [23]:
#categorical features are often encoded as numbers, but it is better to understand them as categorical instead of coninuous
#There are two ways to convery number features into one hot encoding
#use scikit's learn OneHotEncoder , for which you can specify which features are continuous which are discrete
#Or use numeric columns in the DataFrame to strings
#Here is the demo of the following
demo_df = pd.DataFrame({'Integer feature' : [0 , 1, 2 , 1] , 
                       'Categorical feature' : ['socks' , 'fox' , 'socks' , 'fox']})
demo_df

Unnamed: 0,Integer feature,Categorical feature
0,0,socks
1,1,fox
2,2,socks
3,1,fox


In [25]:
#using the get_dummies will only convert the categorical features
pd.get_dummies(demo_df)
#You can it has not touched integer features


Unnamed: 0,Integer feature,Categorical feature_fox,Categorical feature_socks
0,0,0,1
1,1,1,0
2,2,0,1
3,1,1,0


In [33]:
demo_df['Integer feature'] = demo_df['Integer feature'].astype(str)
pd.get_dummies(demo_df)                   #since we have changed the type of the integer feature , now it will be treated categorical

Unnamed: 0,Integer feature_0,Integer feature_1,Integer feature_2,Categorical feature_fox,Categorical feature_socks
0,1,0,0,0,1
1,0,1,0,1,0
2,0,0,1,0,1
3,0,1,0,1,0
