In [1]:
import pandas as pd
import matplotlib
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [4]:
def reading_data():
    training_data=pd.read_csv("Income_train.csv")
    testing_data=pd.read_csv("Income_test.csv")
    return training_data,testing_data

In [7]:
def preprocessing(train,test):

    #type checking
    train.dtypes
    #describing the continuous variables
    train.describe()
    #looking the categorical variables
    cv =train.dtypes.loc[train.dtypes == 'object'].index
    #fetching the unique values of cat 
    train[cv].apply(lambda x: len(x.unique()))
    #count of each category of cat var
    train['Race'].value_counts()
    #%age of obs in each category
    train['Race'].value_counts()/train.shape[0]
    train['Native.Country'].value_counts()/train.shape[0]
    #cross tabulation for cat-cat var
    ct = pd.crosstab(train['Sex'],train['Income.Group'],margins = True)
    # print the stacked chart
    #% matplotlib inline
    ct.iloc[:-1,:-1].plot(kind = 'bar', stacked = True, color = ['red','blue'], grid = False)
        #plotting %age
    def percConvert(ser):
        return ser/float(ser[-1])
    ct2 = ct.apply(percConvert,axis = 1)
    ct2.iloc[:-1,:-1].plot(kind = 'bar', stacked = True, color = ['red','blue'], grid = False)
    #cont var
    train.plot('Age','Hours.Per.Week', kind = 'scatter')
    #cat-cont var
    train.boxplot(column = 'Hours.Per.Week', by = 'Sex')

    #missing values
    train.apply(lambda x: sum(x.isnull()))
    test.apply(lambda x: sum(x.isnull()))

    #imputation

    mode(train['Workclass']).mode[0]

    var_to_impute = ['Workclass','Occupation','Native.Country']
    for var in var_to_impute:
        train[var].fillna(mode(train[var]).mode[0], inplace = True)
        test[var].fillna(mode(test[var]).mode[0], inplace = True)

    #outlier treatment
    #%matplotlib inline
    train.plot('ID','Age', kind = 'scatter')
    train.plot('ID','Hours.Per.Week', kind = 'scatter')

    #variable transformation
    train['Workclass'].value_counts()/train.shape[0]

    categories_to_combine = ['State-gov','Self-emp-inc','Federal-gov','Without-pay','Never-worked']

    for cat in categories_to_combine:
        train['Workclass'].replace({cat:'Others'}, inplace = True)
        test['Workclass'].replace({cat:'Others'}, inplace = True)

    #similarly run a loop to combine all categorieso f all cv
    # removing workplace
    cv = cv[1:]
    cv
 
    for column in cv:
    #determine the categories to combine
        frq = train[column].value_counts()/train.shape[0]
        categories_to_combine = frq.loc[frq.values < 0.05].index

        #loop over all categories and combine them as others
        for cat in categories_to_combine:
            train[column].replace({cat:'Others'}, inplace = True)
            test[column].replace({cat:'Others'}, inplace = True)

    return train,test

#############################################Function for Modelling ######################################

def train_model(train,test):

    cat_var1 =train.dtypes.loc[train.dtypes == 'object'].index
    le =LabelEncoder()
    for var in cat_var1:
        train[var] = le.fit_transform(train[var])
    cat_var2 =test.dtypes.loc[test.dtypes == 'object'].index
    for var in cat_var2:    
        test[var] = le.fit_transform(test[var])

    depv = 'Income.Group'
    indepv = [x for x in train.columns if x not in ['ID',depv]]

    model = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 100, max_features = 'sqrt')

    model.fit(train[indepv],train[depv])

    pred_train = model.predict(train[indepv])
    pred_test = model.predict(test[indepv])
    true_value = train[depv]

    return true_value, pred_train

############################################## Function for Model Evaluation ############################################## 

def model_evaluation(true,predicted):
    print("Train Accuracy {:.2%}".format(accuracy_score(true,predicted)))

##############################################Main Function #######################################################
def main():
    train,test = reading_data()
    preprocessing(train,test)
    actual_train, predicted_train = train_model(train,test)
    model_evaluation(actual_train, predicted_train)


In [8]:
main()

  return getattr(obj, method)(*args, **kwds)


Train Accuracy 81.68%
