In [1]:
%config IPCompleter.greedy=True
import time
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Load Data

In [2]:
train = pd.read_csv("tcd ml 2019-20 income prediction training (with labels).csv", index_col = 0)
train = train[['Country', 'Profession', 'Year of Record', 'Age', 'Body Height [cm]','Income in EUR', 'Size of City','University Degree']]
Y = train['Income in EUR']
test_set = pd.read_csv("tcd ml 2019-20 income prediction test (without labels).csv", index_col = 0)
test_set = test_set[['Country', 'Profession', 'Year of Record', 'Age', 'Body Height [cm]','Income', 'Size of City','University Degree']]

In [3]:
test_set.head()

Unnamed: 0_level_0,Country,Profession,Year of Record,Age,Body Height [cm],Income,Size of City,University Degree
Instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
111994,Honduras,senior project analyst,1992.0,21.0,153,,391652,Master
111995,Kyrgyzstan,greeter,1986.0,34.0,163,,33653,Bachelor
111996,Portugal,liaison,1994.0,53.0,153,,34765,Bachelor
111997,Uruguay,occupational therapist,1984.0,29.0,154,,1494132,No
111998,Serbia,portfolio manager,2007.0,17.0,191,,120661,No


# Split the Data

In [4]:
train["train"] = 1
test_set["train"] = 0

In [5]:
df = pd.concat([train,test_set], sort=True)

In [6]:
df = df.rename(columns = {'Body Height [cm]' : 'Body Height cm'})

# Data Cleaning

In [7]:
col = df.columns
df[col] = df[col].fillna(df.mean().iloc[0])

In [8]:
df.isnull().sum()

Age                  0
Body Height cm       0
Country              0
Income               0
Income in EUR        0
Profession           0
Size of City         0
University Degree    0
Year of Record       0
train                0
dtype: int64

In [9]:
df_dup = df

In [10]:
df1 = df.groupby('Profession')['Income in EUR'].agg('mean')
df1 = df1.to_dict()
df2 = df.groupby('Country')['Income in EUR'].agg('mean')
df2 = df2.to_dict()
df3 = df.groupby('University Degree')['Income in EUR'].agg('mean')
df3 = df3.to_dict()

In [11]:
df.Profession = df.Profession.replace(df1)
df.Country = df.Country.replace(df2)
df['University Degree']= df['University Degree'].replace(df3)

In [12]:
df.head()

Unnamed: 0_level_0,Age,Body Height cm,Country,Income,Income in EUR,Profession,Size of City,University Degree,Year of Record,train
Instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,41.0,193,19901.93856,37.301599,61031.94416,90894.61362,1239930,66171.973738,1997.0,1
2,41.0,186,33970.831049,37.301599,91001.32764,87476.563667,1603504,70029.467371,1996.0,1
3,28.0,170,52065.951148,37.301599,157982.1767,67844.140129,1298017,78634.418796,2018.0,1
4,33.0,171,60790.387872,37.301599,45993.75793,31511.817627,751903,60914.817625,2006.0,1
5,46.0,188,26056.481036,37.301599,38022.16217,54399.130301,95389,86807.478434,2010.0,1


# One-Hot Encoding

In [13]:
# df_2 = pd.get_dummies(df, columns = ["Gender","University Degree","Hair Color"])

In [14]:
# df_2.shape

In [15]:
# new_train = df_2[df_2['train'] == 1]
# new_test = df_2[df_2['train'] == 0]

new_train = df[df['train'] == 1]
new_test = df[df['train'] == 0]

In [16]:
new_train.drop(['Income','train'],axis=1,inplace=True)
new_test.drop(['Income in EUR','train'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
new_train.head()

Unnamed: 0_level_0,Age,Body Height cm,Country,Income in EUR,Profession,Size of City,University Degree,Year of Record
Instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,41.0,193,19901.93856,61031.94416,90894.61362,1239930,66171.973738,1997.0
2,41.0,186,33970.831049,91001.32764,87476.563667,1603504,70029.467371,1996.0
3,28.0,170,52065.951148,157982.1767,67844.140129,1298017,78634.418796,2018.0
4,33.0,171,60790.387872,45993.75793,31511.817627,751903,60914.817625,2006.0
5,46.0,188,26056.481036,38022.16217,54399.130301,95389,86807.478434,2010.0


In [18]:
new_test.head()

Unnamed: 0_level_0,Age,Body Height cm,Country,Income,Profession,Size of City,University Degree,Year of Record
Instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
111994,21.0,153,24919.838128,37.301599,93104.97046,391652,70029.467371,1992.0
111995,34.0,163,28024.190053,37.301599,38587.367753,33653,66171.973738,1986.0
111996,53.0,153,37416.166095,37.301599,41331.420448,34765,66171.973738,1994.0
111997,29.0,154,143768.294289,37.301599,48496.377648,1494132,60914.817625,1984.0
111998,17.0,191,16211.202348,37.301599,75416.825245,120661,60914.817625,2007.0


In [19]:
new_test.drop(['Income'],axis=1,inplace=True)

In [20]:
print(new_test.shape)
print(new_train.shape)

(73230, 7)
(111993, 8)


In [21]:
Xtrain = new_train.drop('Income in EUR',1) # this is data
Ytrain = new_train['Income in EUR'] #This is label

In [22]:
print(Xtrain.shape)
print(Ytrain.shape)

(111993, 7)
(111993,)


In [23]:
# def is_missing(df, columns):
#     missing_value = {}
#     print('Number of Missing at each column')
#     length_df = len(df)
#     for i in columns:
#         total_value = df[i].value_counts().sum()
#         missing_value[i] = length_df - total_val 
#     print(missing_value)

In [24]:
Xstdscaler = StandardScaler()
stdx = Xstdscaler.fit_transform(Xtrain)

Test_stdscaler = StandardScaler()
std_test = Xstdscaler.fit_transform(new_test)

In [25]:
scaled_Xtrain = pd.DataFrame(stdx, columns = Xtrain.columns)
scaled_newtest = pd.DataFrame(std_test, columns = new_test.columns)

In [26]:
# scaled_Xtrain.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in scaled_Xtrain.columns.values]

In [27]:
# scaled_newtest.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in scaled_newtest.columns.values]

# Random Forest

In [28]:
# %%time
# Random_Forest = RandomForestRegressor(n_estimators = 1000, random_state = 0)
# RF_Model = Random_Forest.fit(scaled_Xtrain,Ytrain)
# Accu = (RF_Model.score(scaled_Xtrain,Ytrain))

# KNN

In [None]:
# %%time
# knn = KNeighborsClassifier()
# knn.fit(scaled_Xtrain,training_scores_encoded)
# Y_Pred = knn.predict(scaled_newtest)
#Accu = (knn.score(scaled_Xtrain,Ytrain))

# Label Encoder

In [29]:
# from sklearn import preprocessing
# from sklearn import utils
# lab_enc = preprocessing.LabelEncoder()
# training_scores_encoded = lab_enc.fit_transform(Ytrain)
# print(training_scores_encoded)
# print(utils.multiclass.type_of_target(Ytrain))
# print(utils.multiclass.type_of_target(Ytrain.astype('int')))
# print(utils.multiclass.type_of_target(training_scores_encoded))

# Final Pre-processing

In [30]:
from sklearn.preprocessing import PolynomialFeatures

poly1 = PolynomialFeatures()
Xtrain = poly1.fit_transform(Xtrain)

poly2 = PolynomialFeatures()
new_test = poly2.fit_transform(new_test)

# XGBoost

In [35]:
%%time
xg_reg = xgb.XGBRegressor(objective ='reg:linear', learning_rate = 0.11, random_state = 82, max_depth = 6, alpha = 5, subsample=0.7, colsample_bytree = 1)
model = xg_reg.fit(Xtrain,Ytrain)


CPU times: user 53.9 s, sys: 142 ms, total: 54 s
Wall time: 54.1 s


In [37]:
prediction = model.predict(new_test)

# Submission

In [38]:
Submit = pd.DataFrame()
Submit['Instance'] = test_set.index
Submit['Income'] = prediction
Submit.to_csv('sub6.csv',index=False)