In [573]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from math import sqrt, log, exp
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
%matplotlib inline 

In [574]:
census_train_data= pd.read_csv('census_train.csv')
census_test_data = pd.read_csv('census_test.csv')

# Adding Headers to Data
census_train_data.columns = ["idnum", "age", "workerclass", "interestincome", "traveltimetowork",\
                             "vehicleoccupancy", "meansoftransport", "marital", "schoolenrollment",\
                             "educationalattain", "sex", "workarrivaltime", "hoursworkperweek", "ancestry",\
                             "degreefield", "industryworkedin", "wages"]

census_test_data.columns = ["idnum", "age", "workerclass", "interestincome", "traveltimetowork",\
                             "vehicleoccupancy", "meansoftransport", "marital", "schoolenrollment",\
                             "educationalattain", "sex", "workarrivaltime", "hoursworkperweek", "ancestry",\
                             "degreefield", "industryworkedin"]
# adding SalePrice column to test dataset for consistency
census_test_data['wages'] = 0
frames = [census_train_data, census_test_data]
df = pd.concat(frames)
df.shape

(1682, 17)

In [575]:
def cleaning(data):
    # Cleaning workerclass -  Filling it with zero's (New Category) as the description represents a different class from others
    data['workerclass'] = data['workerclass'].replace('?', 0)
    
    # Cleaning traveltimetowork - Filling it with zero's (New Category) as worker who works from home will never travel, 
    # Same applies to a worker who never worked.
    data['traveltimetowork'] = data['traveltimetowork'].replace('?', 0)

    # ******Cleaning vehicleoccupancy - Filling it with zero's (New Category) for now*******
    data['vehicleoccupancy'] = data['vehicleoccupancy'].replace('?', 0)

    # Cleaning meansoftransport - Filling it with zero's (New Category) as non-worked is assumed that he/she will not travel to work
    data['meansoftransport'] = data['meansoftransport'].replace('?', 0)

    # Cleaning schoolenrollment - Filling it with zero's (New Category) as the least age for enrollment is atleast 5 Years
    # 0 - No, less than 3 years
    data['schoolenrollment'] = data['schoolenrollment'].replace('?', 0)

    # Cleaning educationalattain - Filling it with zero's (New Category) as the least age for enrollment is atleast 5 Years
    # 1 - No Education Attainment, less than 3 years
    data['educationalattain'] = data['educationalattain'].replace('?', 1)

    # Cleaning meansoftransport - Filling it with zero's (New Category) as non-worked is assumed that he/she will not travel to work
    data['workarrivaltime'] = data['workarrivaltime'].replace('?', 0)

    # Cleaning hoursworkperweek - Filling it with zero's (New Category) for non-worker
    data['hoursworkperweek'] = data['hoursworkperweek'].replace('?', 0)

    # Cleaning degreefield - Filling it with 1111 (New Category) for unknown degree
    # 1000, less than bachelor's degree (Related to educationattain)
    data['degreefield'] = data['degreefield'].replace('?', 1000)

    # Cleaning industryworkedin - Filling it with 1111 (New Category) for unknown degree
    # 100, less than 16 years old/not in labor force who last worked more than 5 years ago or never worked
    data['industryworkedin'] = data['industryworkedin'].replace('?', 100)
        
    return data

In [576]:
def log_rmse(model, X, y):
    # Fit model on X & y inp     
    model.fit(X,y)
    prediction = model.predict(X)
    
    return sqrt(mean_squared_error((y), prediction))

def generate_submission(test,model,submission_name):
    test_prediction = model.predict(test)
    test_pred = pd.DataFrame(abs(np.exp(test_prediction)))
    ID = pd.DataFrame(test['Id'])
    pred = ID.join(test_pred)
    submission_file = submission_name + '.csv'
    pred.to_csv(submission_file, header=['Id','Salary'],index=False)

In [577]:
def data_drop(data):
    data = data.drop(['traveltimetowork', 'ancestry'],axis=1)
    return data

In [578]:
def workarrivaltimebins(data):
    count=1
    final=0
    for x in range(len(data.workarrivaltime)):
        if(x%20 == 0):
            data['workarrivaltime'][x-20:x+1] = final
    #         c=c+1
    #         f=f+1
            final += 1
    data['workarrivaltime'].unique()
    return data

In [579]:
clean_data = cleaning(df)
clean_data.head(5)

Unnamed: 0,idnum,age,workerclass,interestincome,traveltimetowork,vehicleoccupancy,meansoftransport,marital,schoolenrollment,educationalattain,sex,workarrivaltime,hoursworkperweek,ancestry,degreefield,industryworkedin,wages
0,1,34,3,0,10,1,1,5,1,16,1,168,40,51,1000,7860,34500
1,2,57,2,0,15,8,1,5,1,16,1,100,30,51,1000,8390,4700
2,3,17,0,0,0,0,0,5,2,15,2,0,0,714,1000,100,0
3,4,42,1,0,20,1,1,1,1,22,1,92,45,21,5001,6990,120000
4,5,18,0,0,0,0,0,5,2,15,2,0,0,999,1000,100,0


In [580]:
clean_data = workarrivaltimebins(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [581]:
clean_data = data_drop(clean_data)

In [582]:
clean_data

Unnamed: 0,idnum,age,workerclass,interestincome,vehicleoccupancy,meansoftransport,marital,schoolenrollment,educationalattain,sex,workarrivaltime,hoursworkperweek,degreefield,industryworkedin,wages
0,1,34,3,0,1,1,5,1,16,1,1,40,1000,7860,34500
1,2,57,2,0,8,1,5,1,16,1,1,30,1000,8390,4700
2,3,17,0,0,0,0,5,2,15,2,1,0,1000,100,0
3,4,42,1,0,1,1,1,1,22,1,1,45,5001,6990,120000
4,5,18,0,0,0,0,5,2,15,2,1,0,1000,100,0
5,6,67,1,9500,0,0,1,1,21,1,1,3,2405,7390,4000
6,7,21,2,0,0,0,5,3,16,2,1,0,1000,8590,0
7,8,18,1,0,0,12,5,3,19,2,1,10,1000,4970,1100
8,10,43,2,0,0,4,5,1,16,1,1,24,1000,9180,48000
9,11,83,0,0,0,0,4,1,15,1,1,0,1000,100,0


In [583]:
df =  clean_data

In [584]:
# From our combined we get back our train & test datasets
train = df[:1183]
test = df[1183:]

print(train.shape)
print(test.shape)

# Combine train and test so we can continue performing other operations on the whole dataset
frames = [train, test]
df = pd.concat(frames)
print(df.shape)

# Make sure there is no null value in the target feature
np.where(np.isnan(df['wages']))

(1183, 15)
(499, 15)
(1682, 15)


(array([], dtype=int64),)

In [585]:
# Get all numeric features
#numerical_features = ['age', 'interestincome', 'traveltimetowork', 'workarrivaltime', 'hoursworkperweek']
numerical_features = ['age', 'interestincome', 'hoursworkperweek']


scaler = StandardScaler()

# Again, from our combined df we get back our train & test datasets.
train = df[:1183]
test = df[1183:]

train.loc[:,numerical_features] = scaler.fit_transform(train[numerical_features])
test.loc[:,numerical_features] = scaler.transform(test[numerical_features])

# Combine train and test so we can continue performing other operations on the whole dataset
frames = [train, test]
df = pd.concat(frames)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,idnum,age,workerclass,interestincome,vehicleoccupancy,meansoftransport,marital,schoolenrollment,educationalattain,sex,workarrivaltime,hoursworkperweek,degreefield,industryworkedin,wages
0,1,-0.701676,3,-0.123582,1,1,5,1,16,1,1,0.795301,1000,7860,34500
1,2,0.444641,2,-0.123582,8,1,5,1,16,1,1,0.321874,1000,8390,4700
2,3,-1.548954,0,-0.123582,0,0,5,2,15,2,1,-1.098406,1000,100,0
3,4,-0.302957,1,-0.123582,1,1,1,1,22,1,1,1.032014,5001,6990,120000
4,5,-1.499114,0,-0.123582,0,0,5,2,15,2,1,-1.098406,1000,100,0


In [586]:
# Make sure there is no null value in the target feature
np.where(np.isnan(df['wages']))

(array([], dtype=int64),)

In [587]:
#category = ['workerclass', 'vehicleoccupancy',  'meansoftransport', 'marital', 'schoolenrollment', 'educationalattain', 'sex', 'ancestry','degreefield','industryworkedin']
category = ['workerclass', 'vehicleoccupancy',  'meansoftransport', 'marital', 'schoolenrollment', 'educationalattain', 'sex', 'degreefield','industryworkedin']

df_dummies = pd.get_dummies(df[category])
df = df.drop(df[category],axis=1)
df = pd.concat([df, df_dummies],axis=1)

In [588]:
# For the last time, from our combined df we get back our train & test datasets.
train = df[:1183]
test = df[1183:]

test = test.drop(['wages'],axis=1)

# Log transformation of the target feature
train.loc[:,'wagesLog'] = np.log(train['wages'])

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [589]:
X = train.drop(['wages','wagesLog'],axis=1)
X.head()

Unnamed: 0,idnum,age,interestincome,workarrivaltime,hoursworkperweek,marital,schoolenrollment,educationalattain,sex,workerclass_0,...,industryworkedin_9470,industryworkedin_9480,industryworkedin_9490,industryworkedin_9570,industryworkedin_9590,industryworkedin_9670,industryworkedin_9680,industryworkedin_9770,industryworkedin_9870,industryworkedin_9920
0,1,-0.701676,-0.123582,1,0.795301,5,1,16,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.444641,-0.123582,1,0.321874,5,1,16,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,-1.548954,-0.123582,1,-1.098406,5,2,15,2,1,...,0,0,0,0,0,0,0,0,0,0
3,4,-0.302957,-0.123582,1,1.032014,1,1,22,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,-1.499114,-0.123582,1,-1.098406,5,2,15,2,1,...,0,0,0,0,0,0,0,0,0,0


In [590]:
y = train[['wages', 'wagesLog']]

In [591]:
#X.to_csv("data.csv")

In [592]:
X = X.fillna(0)
y = y.replace([np.inf, -np.inf], 0)

In [593]:
from sklearn.linear_model import LinearRegression

scorer = make_scorer(r2_score)
LR_model = LinearRegression()
LR_model.fit(X, y.drop('wages',axis=1))
train_score = LR_model.score(X, y.drop('wages',axis=1))
cv_score = cross_val_score(LR_model, X, y.drop('wages',axis=1),cv=3, scoring = scorer)

In [594]:
train_score

0.938645378135172

In [595]:
cv_score.mean()

-7577775504832.18

In [596]:
# Initialize different regression algorithms
lm = linear_model.LinearRegression()
ridge = linear_model.Ridge()
decision_tree = DecisionTreeRegressor()
random_forest = RandomForestRegressor()
KNN = KNeighborsRegressor()

models = [lm, ridge, decision_tree, random_forest,KNN]
model_name = ['lm', 'ridge', 'decision_tree','random_forest','KNN']
scorer = make_scorer(r2_score)
result = {}

for name, model in enumerate(models):
    
    model.fit(X, y.drop('wages',axis=1))
    train_score = model.score(X, y.drop('wages',axis=1))
    cv_score = cross_val_score(model, X, y.drop('wages',axis=1),cv=3, scoring = scorer)
    name = model_name[name]
    result[name] = [train_score,cv_score.mean()]

for model_scores in result:
    print(model_scores, result[model_scores])

  from ipykernel import kernelapp as app
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


lm [0.938645378135172, -7577775504832.18]
ridge [0.9354537218235969, 0.8890768398302065]
decision_tree [1.0, 0.9569637995712276]
random_forest [0.9953663073349158, 0.9700557019320698]
KNN [0.37592061010349, -0.28609262688744147]


  estimator.fit(X_train, y_train, **fit_params)


In [597]:
# Trying Previous Method
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = np.array(train_test_split(X, y, train_size = 0.7))

LR_model = LinearRegression()
LR_model.fit(x_train, y_train.drop('wages',axis=1))
preds = LR_model.predict(x_test)
MSE = mse(y_test.drop('wages',axis=1), preds)
print(MSE)



3.7458253652443854


In [598]:
from sklearn.ensemble import RandomForestRegressor as rf
x_train, x_test, y_train, y_test = np.array(train_test_split(X, y, train_size = 0.7))
    
model = rf()
model.fit(x_train,y_train.drop('wages',axis=1))
preds = model.predict(x_test)

  """


In [599]:
MSE = mse(y_test.drop('wages',axis=1), preds)
print(MSE)

0.7708680386217261


In [600]:
predicted_salary = np.exp(list(preds))-1
predicted_salary = np.round(predicted_salary/100)*100

In [601]:
MSE = mse(y_test.drop('wagesLog',axis=1), predicted_salary)
print(sqrt(MSE))

65421.64240686157


In [564]:
pd.DataFrame(predicted_salary)

Unnamed: 0,0
0,102200.0
1,3200.0
2,0.0
3,0.0
4,128300.0
5,110500.0
6,2600.0
7,28600.0
8,11600.0
9,1100.0


In [491]:
pd.DataFrame(y_test.drop('wagesLog',axis=1))

Unnamed: 0,wages
520,0
985,0
11,5000
870,0
476,18000
867,13300
220,45000
111,0
326,0
655,52000


In [299]:
test_pred = pd.DataFrame(abs(np.exp(preds)))
test_pred = np.round(test_pred/100)*100

In [48]:
y_test_round = pd.DataFrame(abs(np.exp(y_test)))
y_test_round = np.round(test_pred/100)*100

In [301]:
mse(y_test.drop('wagesLog',axis=1), test_pred)

2769077962.535211

In [151]:
mse(y_test,preds)

2774233372.4409285