In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from scipy import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

import main

In [43]:
## Entry Cell of ML Pipeline
## NOTE: ReadInData, PrintOutMissingData, ScaleTheData and CreateDummyColumns
## are defined in cells further down this notebook; run those cells first.

FileName = "../data/tcd ml 2019-20 income prediction training (with labels).csv"
NonNumericalColumns = ['Country','Profession','Gender','University Degree']

# Load the raw training data (ReadInData also prints its shape and summary).
DataFrame = ReadInData(FileName)

# Null counts before and after the missing-data handling, for comparison.
PrintOutMissingData(DataFrame)

DataFrame = main.HandleMissingData(DataFrame)
PrintOutMissingData(DataFrame)
DataFrame = ScaleTheData(DataFrame, NonNumericalColumns)
DataFrame = CreateDummyColumns(DataFrame, NonNumericalColumns)

# Show only a preview instead of dumping the whole processed frame.
print(DataFrame.head())

|---READING IN DATA---|
Shape of Data: (111993, 12)
            Instance  Year of Record            Age  Size of City  \
count  111993.000000   111552.000000  111499.000000  1.119930e+05   
mean    55997.000000     1999.421274      37.345304  8.388538e+05   
std     32329.738686       11.576382      16.036694  2.196879e+06   
min         1.000000     1980.000000      14.000000  7.700000e+01   
25%     27999.000000     1989.000000      24.000000  7.273400e+04   
50%     55997.000000     1999.000000      35.000000  5.060920e+05   
75%     83995.000000     2009.000000      48.000000  1.184501e+06   
max    111993.000000     2019.000000     115.000000  4.999251e+07   

       Wears Glasses  Body Height [cm]  Income in EUR  
count  111993.000000     111993.000000   1.119930e+05  
mean        0.500531        175.220192   1.092138e+05  
std         0.500002         19.913889   1.498024e+05  
min         0.000000         94.000000  -5.696906e+03  
25%         0.000000        160.000000   3.077

## Preprocessing of Data

In [22]:
## Start of Notebook
def ReadInData(filename):
    """Load ``filename`` through ``main.ReadInData`` and print a short summary.

    Prints a status banner, the shape of the loaded data and the result of
    its ``describe()`` call, then returns the loaded object unchanged.
    """
    main.PrintStatus('READING IN DATA')
    loaded = main.ReadInData(filename)
    print(f'Shape of Data: {loaded.shape}')
    summary = loaded.describe()
    print(summary)
    return loaded


In [26]:
def PrintOutMissingData(RawData):
    """Print a status banner followed by the per-column count of null values."""
    main.PrintStatus('NULL VALUES FOUND')
    missing_per_column = RawData.isna().sum()
    print(f"Number of null values in each column:\n{missing_per_column}")

In [8]:
## Dropping Features that are missing a large amount of data
## Dropping instances with missing data

# RawData = main.HandleMissingData(RawData)

In [37]:
## Scale data
def ScaleTheData(RawData, NonNumericalColumns):
    """Min-max scale selected numeric feature columns of ``RawData`` to [0, 1].

    All columns except ``NonNumericalColumns`` are passed through a
    ``MinMaxScaler``; the scaled values of 'Year of Record', 'Age',
    'Size of City' and 'Body Height [cm]' are then written back.

    Parameters
    ----------
    RawData : pandas.DataFrame
        Frame holding the four columns above plus ``NonNumericalColumns``.
        It is modified in place.
    NonNumericalColumns : list of str
        Columns to exclude from scaling.

    Returns
    -------
    pandas.DataFrame
        The same ``RawData`` object, with the four columns rescaled.
    """
    main.PrintStatus('SCALING FEATURES')
    NumericalDataOnly = RawData.drop(columns=NonNumericalColumns)
    scaler = MinMaxScaler(feature_range=(0,1))
    scaler.fit(NumericalDataOnly)
    rescaledData = scaler.transform(NumericalDataOnly)

    # Keep the original column names and row index. A bare DataFrame(array)
    # gets a fresh 0..n-1 index, so assigning it back would align on the
    # wrong labels (NaNs / shifted values) once rows have been dropped.
    rescaled_df = pd.DataFrame(
        rescaledData,
        columns=NumericalDataOnly.columns,
        index=NumericalDataOnly.index,
    )
    print(rescaled_df)

    # Select by name rather than by position so the result does not depend
    # on the column order of the input.
    for column in ('Year of Record', 'Age', 'Size of City', 'Body Height [cm]'):
        RawData[column] = rescaled_df[column]

    return RawData

In [None]:
## Removing Outliers. Outliers are rows whose income lies more than
## 3 standard deviations from the mean (|z-score| > 3).
## Did not use in the final solution*
## NOTE(review): expects a module-level `RawData` frame; no visible cell creates one - confirm.
## `stats` (scipy) and `np` come from the notebook's import cell.

# Work on a plain array so z-scores are positional and independent of the
# frame's index labels.
z = np.abs(stats.zscore(RawData['Income in EUR'].to_numpy()))
print(z)
print(z.shape)

outlier_mask = z > 3
index = np.flatnonzero(outlier_mask).tolist()  # row positions of the outliers
count = z[outlier_mask].tolist()               # their absolute z-scores

# Filter with the positional boolean mask. Dropping the positions by label
# would remove the wrong rows whenever the index is not 0..n-1.
RawData = RawData.loc[~outlier_mask]
print("Removing " + str(len(index)))
print("New Number of Instances: " + str(RawData.shape))

In [None]:
## Seeing the data and the unique values
## For every column: print its name, its distinct values and how many there are.

for column_name in RawData.columns.values:
    distinct_values = RawData[column_name].unique()
    print("Features in " + column_name)
    print(distinct_values)
    print(len(distinct_values))


In [41]:
## Section to convert categorical columns to a binary (one-hot) matrix
## Changing Country, Profession
## Creating clean training data
def CreateDummyColumns(RawData, columns):
    """One-hot encode the given categorical columns of ``RawData``.

    Parameters
    ----------
    RawData : pandas.DataFrame
        Frame containing the columns to encode.
    columns : list of str or None
        Names of the columns to replace by dummy columns. ``None`` falls back
        to Country, Profession, Gender and University Degree.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each listed column is replaced by one
        ``<column>_<value>`` indicator column per distinct value.
    """
    main.PrintStatus('CREATING DUMMIES')
    if columns is None:
        columns = ['Country', 'Profession', 'Gender', 'University Degree']
    # Honour the caller's column list instead of a hard-coded one.
    return pd.get_dummies(RawData, columns=list(columns))

In [None]:
# Summary statistics of RawData, shown as the cell output.
# NOTE(review): expects a module-level `RawData`; no visible cell creates one - confirm.
RawData.describe()

In [44]:
# Preview the first 10 rows of the frame produced by the entry cell.
DataFrame.head(10)

Unnamed: 0,Year of Record,Age,Size of City,Wears Glasses,Body Height [cm],Income in EUR,Country_Afghanistan,Country_Albania,Country_Algeria,Country_Angola,...,Profession_youth initiatives lead advisor,Gender_0,Gender_female,Gender_male,Gender_other,Gender_unknown,University Degree_Bachelor,University Degree_Master,University Degree_No,University Degree_PhD
0,0.435897,0.267327,0.024801,0,0.578947,61031.94416,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,0.410256,0.267327,0.032073,0,0.538012,91001.32764,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0.974359,0.138614,0.025963,1,0.444444,157982.1767,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.666667,0.188119,0.015039,1,0.450292,45993.75793,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.769231,0.316832,0.001907,0,0.549708,38022.16217,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,0.282051,0.455446,0.00112,0,0.508772,125809.9903,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
6,0.051282,0.564356,0.024106,0,0.467836,150319.4226,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
7,0.717949,0.217822,0.000627,0,0.561404,11849.02876,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
8,0.897436,0.287129,0.001705,0,0.555556,48058.20607,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
9,1.0,0.019802,0.02651,1,0.368421,90940.68912,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [None]:
## Output Training Data to new file
# The processed frame built by the entry cell is `DataFrame`; no visible cell
# defines a module-level `RawData`, so writing that name fails on a fresh kernel.
DataFrame.to_csv("../data/processedData_catboost.csv", index=False)

## Training

In [None]:
## Read in new data and model
## Loads the processed training file written by the export cell.

CleanData = main.ReadInData("../data/processedData_catboost.csv")

In [None]:
# Call the method; without parentheses the cell only shows the bound method.
CleanData.columns.unique()

In [None]:
# Separate the prediction target from the input features.
# features = CleanData.drop(labels=['Income in EUR', 'Wears Glasses', 'Body Height [cm]'], axis=1)
features = CleanData.drop(columns=['Income in EUR'])
income = CleanData['Income in EUR']

features.head(5)

In [None]:
## Dropped columns that had fewer than 5 occurrences.
## Did not use in the final solution**

# Columns from position 5 onwards are treated as 0/1 indicator columns
# (assumes the first five are the numeric features - TODO confirm).
# The slice includes the last column, which the former range(...) loop skipped.
dummy_columns = features.iloc[:, 5:]

# Count the 1s per column in one vectorised pass. This also copes with a
# column that contains no 1 at all and takes names from the column index
# rather than from value_counts().name.
occurrences = (dummy_columns == 1).sum()
columnsToDrop = occurrences[occurrences < 5].index.tolist()
count = len(columnsToDrop)

print(len(columnsToDrop))

In [None]:
# Remove the columns collected in columnsToDrop and preview the result.
features = features.drop(labels=columnsToDrop, axis=1)
print(features.head(5))

In [None]:
## Splitting Data
## 80% training / 20% validation, with a fixed random_state so the split is repeatable.

features_train, features_val, income_train, income_val = train_test_split(features, income, test_size=0.2, random_state=0)


In [None]:
## Using Linear Regression
## Fitted on the training split; coefficients and intercept are printed for inspection.

model = LinearRegression()
model.fit(features_train, income_train)

print(model.coef_)
print(model.intercept_)

## Using Ridge Regression 
## NOTE: the estimator is RidgeCV (not plain Ridge), constructed with its default arguments.
rig_model = RidgeCV()
rig_model.fit(features_train, income_train)

print(rig_model.coef_)
print(rig_model.intercept_)

# Using Random Forest Regression
# RFR = RandomForestRegressor(max_depth=200)
# RFR.fit(features_train, income_train)

# Using CatBoost
# 20000 boosting iterations; no random seed or other options are set here.
cat_model = CatBoostRegressor(iterations=20000)
cat_model.fit(features_train, income_train)


## Validating Results

In [None]:
## Predict with the validation data

income_predict = model.predict(features_val)
# Actual vs. predicted income for the linear model (also used by the plot cell).
comparison = pd.DataFrame({'Actual': income_val, 'Predicted': income_predict})

income_predict_rig = rig_model.predict(features_val)

# income_predict_RFR = RFR.predict(features_val)

income_predict_cat = cat_model.predict(features_val)

# Last expression of the cell so the notebook renders the table; a bare
# name in the middle of a cell is evaluated and silently discarded.
comparison

In [None]:
## Validation errors (MAE, MSE, RMSE), printed for each model in this order:
## RMSE - LINEAR MODEL, RMSE - RIDGE MODEL, RMSE - CATBOOST MODEL
for predictions in (income_predict, income_predict_rig, income_predict_cat):
    mean_abs_err = metrics.mean_absolute_error(income_val, predictions)
    mean_sq_err = metrics.mean_squared_error(income_val, predictions)
    print('Mean Absolute Error:', mean_abs_err)
    print('Mean Squared Error:', mean_sq_err)
    print('Root Mean Squared Error:', np.sqrt(mean_sq_err))

## ---- Recorded results from earlier runs ----

## LINEAR MODEL
## Mean Absolute Error: 43557.00458896904
# Mean Squared Error: 7422065078.367375
# Root Mean Squared Error: 86151.40787223025

## RIDGE MODEL
# Mean Absolute Error: 42800.374852794565
# Mean Squared Error: 6624261842.145419
# Root Mean Squared Error: 81389.56838652863

## With Gender and University Degree
## STANDARD
# Mean Absolute Error: 42683.66765298456
# Mean Squared Error: 6611260404.6631155
# Root Mean Squared Error: 81309.65751165796

## RIDGE
# Mean Absolute Error: 42466.72095408424
# Mean Squared Error: 6546309166.8984585
# Root Mean Squared Error: 80909.26502507892

## RMSE - RANDOM FOREST MODEL
# print('Mean Absolute Error:', metrics.mean_absolute_error(income_val, income_predict_RFR))  
# print('Mean Squared Error:', metrics.mean_squared_error(income_val, income_predict_RFR))  
# print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(income_val, income_predict_RFR)))

## Random Forest Regressor
# Mean Absolute Error: 34068.166948849575
# Mean Squared Error: 6755748786.367559
# Root Mean Squared Error: 82193.36217948236

## On attempt in remove outlier of Country and Professions
# Mean Absolute Error: 42537.31959067379
# Mean Squared Error: 6388705859.717933
# Root Mean Squared Error: 79929.38045373511
# Mean Absolute Error: 42541.744230243545
# Mean Squared Error: 6399195869.625069
# Root Mean Squared Error: 79994.97402727917

# Catboost
# Mean Absolute Error: 31269.4598656221
# Mean Squared Error: 5062659956.418068
# Root Mean Squared Error: 71152.37140403732

In [None]:
## Graphing Difference 
## Bar chart of the first 25 rows of the actual-vs-predicted table.

df1 = comparison.iloc[:25]
df1.plot.bar(figsize=(16,10))
for grid_kind, line_style, line_colour in (('major', '-', 'green'), ('minor', ':', 'black')):
    plt.grid(which=grid_kind, linestyle=line_style, linewidth='0.5', color=line_colour)
plt.show()

## Data Exploring 

In [None]:
## Read in Test and try to map
## Cell used to explore difference in Train data and Test data
## Loads the unlabeled test file and previews its first 5 rows.

TestData = main.ReadInData("../data/tcd ml 2019-20 income prediction test (without labels).csv")
TestData.head(5)


In [None]:
## Comparing Training and Test (Country)
test_countries = TestData['Country'].unique()
print(test_countries)
print("Test Shape " + str(len(test_countries)))
Raw = main.ReadInData("../data/tcd ml 2019-20 income prediction training (with labels).csv")
raw_countries = Raw['Country'].unique()
print(raw_countries)
print("Raw Shape " + str(len(raw_countries)))

# Series.equals compares the two columns row by row (and is False whenever
# their lengths differ), which says nothing about the categories. Compare
# the sets of distinct non-null values instead.
print(set(TestData['Country'].dropna()) == set(Raw['Country'].dropna()))

## Test has less than 3 to Training

In [None]:
## Comparing Training and Test (Profession)
test_professions = TestData['Profession'].unique()
raw_professions = Raw['Profession'].unique()
print(test_professions)
print("Test Shape " + str(len(test_professions)))
print(raw_professions)
print("Raw Shape " + str(len(raw_professions)))

# Series.equals compares the two columns row by row (and is False whenever
# their lengths differ), which says nothing about the categories. Compare
# the sets of distinct non-null values instead.
print(set(TestData['Profession'].dropna()) == set(Raw['Profession'].dropna()))

## Test Data has less 15 to Training

In [None]:
## Comparing Training and Test (University Degree)
test_degrees = TestData['University Degree'].unique()
raw_degrees = Raw['University Degree'].unique()
print(test_degrees)
print("Test Shape " + str(len(test_degrees)))
print(raw_degrees)
print("Raw Shape " + str(len(raw_degrees)))

# Series.equals compares the two columns row by row (and is False whenever
# their lengths differ), which says nothing about the categories. Compare
# the sets of distinct non-null values instead.
print(set(TestData['University Degree'].dropna()) == set(Raw['University Degree'].dropna()))


In [None]:
## Cell to Compare what is in Test and Training Data
## For Country (C) and Profession (P): values present in one data set but
## missing from the other, in both directions.

value_comparisons = [
    ('Unique_C_InTest', TestData["Country"], Raw["Country"]),
    ('Unique_P_InTest', TestData["Profession"], Raw["Profession"]),
    ('Unique_C_InTrain', Raw["Country"], TestData["Country"]),
    ('Unique_P_InTrain', Raw["Profession"], TestData["Profession"]),
]

for label, source, other in value_comparisons:
    # A fresh frame per comparison: each result keeps its own row index, so
    # the results cannot share one frame without being re-aligned.
    Diff = pd.DataFrame()
    # Plain string key (the former `Diff['Unique_P_InTest',]` created a
    # column named by a one-element tuple).
    Diff[label] = source[~source.isin(other)].drop_duplicates()
    print(Diff[label])
    print(str(len(Diff[label])))


## Prediction of Test Data

In [None]:
## Cell used to test MapColVarToMap
## Main Cell to Clean Test Data

categorical_columns = ['Country', 'Profession', 'Gender', 'University Degree']
scaled_columns = ['Year of Record', 'Age', 'Size of City', 'Body Height [cm]']

Raw = main.ReadInData("../data/tcd ml 2019-20 income prediction training (with labels).csv")
test = main.ReadInData("../data/tcd ml 2019-20 income prediction test (without labels).csv")
test = main.HandleMissingData(test)
# Numeric part of the test data: drop the categorical columns and the
# 'Income' column in a single call.
CleanTest = test.drop(columns=categorical_columns + ['Income'])

# Profession's NaN must be filled with No, to allow function MapColVarToModelInputs to sort columns
Raw = main.HandleMissingData(Raw)

## Scale data
# Fit the scaler on the TRAINING columns and only transform the test data,
# so a given raw value maps to the same scaled value the models were trained
# on. Fitting on the test set would use different min/max values.
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(Raw[scaled_columns])
rescaledData = scaler.transform(CleanTest[scaled_columns])
# Keep column names and the row index so the assignment below aligns row for
# row even if CleanTest's index is not 0..n-1.
df = pd.DataFrame(rescaledData, columns=scaled_columns, index=CleanTest.index)
print(df)
CleanTest[scaled_columns] = df


# Create 1st DF as copy. Create 2nd DF with encoding. Cat together

countryDf = main.MapColVarToModelInputs(Raw, test, 'Country')
professionDf = main.MapColVarToModelInputs(Raw, test, 'Profession')
genderDf = main.MapColVarToModelInputs(Raw, test, 'Gender')
uniDf = main.MapColVarToModelInputs(Raw, test, 'University Degree')

print(CleanTest.columns)
# Numeric columns first, then the encoded categorical blocks, side by side.
TestInputs = pd.concat([CleanTest, countryDf, professionDf, genderDf, uniDf], axis=1)
print(TestInputs.columns)
# TestInputs = TestInputs.drop(columns=columnsToDrop)

In [None]:
# Quick look at the encoded frames and the assembled test inputs.
print(professionDf.head(5))
print(countryDf.head(5))
# NOTE(review): `count` is only assigned in cells marked "did not use in the
# final solution"; this line presumably depends on one of them having run - confirm.
print(count)
print(TestInputs.head(5))

In [None]:
# Save the assembled test inputs without the row index.
TestInputs.to_csv("../data/processedTest_catboost.csv", index=False)

In [None]:
# Read back the file written by the export cell above. The previous path,
# processedTest_4.csv, is not produced anywhere in this notebook.
TestInputs = main.ReadInData('../data/processedTest_catboost.csv')

In [None]:
## Linear Regression

test_prediction = model.predict(TestInputs)
print(test_prediction)

# Fill the submission template with the predictions (3 decimals) and save it.
submit = main.ReadInData('../data/tcd ml 2019-20 income prediction submission file.csv')
submit['Income'] = np.round(test_prediction, 3)
submit.to_csv('../data/my_predictions_8.csv', index=False)

# Preview as the last expression so the notebook renders it; in the middle
# of the cell the result was silently discarded.
submit.head(5)

In [None]:
## Ridge Regression
test_prediction_rig = rig_model.predict(TestInputs)
print(test_prediction_rig)

# Load the submission template, set its Income column to the predictions
# rounded to 2 decimals, and write the result out.
submit = main.ReadInData(
    '../data/tcd ml 2019-20 income prediction submission file.csv'
).assign(Income=np.round(test_prediction_rig, 2))
submit.to_csv('../data/my_predictions_7.csv', index=False)

In [None]:
## CatBoost Regression
test_prediction_cat = cat_model.predict(TestInputs)
print(test_prediction_cat)

# Load the submission template, set its Income column to the predictions
# rounded to 4 decimals, and write the result out.
submit = main.ReadInData(
    '../data/tcd ml 2019-20 income prediction submission file.csv'
).assign(Income=np.round(test_prediction_cat, 4))
submit.to_csv('../data/my_predictions_10.csv', index=False)

## Exploring Data PT.2

In [None]:
## Visualising
## Income plotted against three features of Raw, one panel per feature.

plt.figure(figsize=(16,3))

for panel, feature in enumerate(('Year of Record', 'Gender', 'Age'), start=1):
    plt.subplot(1, 3, panel)
    plt.scatter(x = Raw[feature], y = Raw['Income in EUR'])
# plt.subplot(111)
# plt.scatter(x = Raw['Country'], y = Raw['Income in EUR'])
plt.show()

In [None]:
## More Visualising Cells
## Same three panels as the previous cell, drawn from the `dropped` frame.
## NOTE(review): no visible cell defines `dropped` - confirm where it comes from.

print(dropped.shape)

plt.figure(figsize=(16,3))

for panel, feature in enumerate(('Year of Record', 'Gender', 'Age'), start=1):
    plt.subplot(1, 3, panel)
    plt.scatter(x = dropped[feature], y = dropped['Income in EUR'])
# plt.subplot(111)
# plt.scatter(x = Raw['Country'], y = Raw['Income in EUR'])
plt.show()

In [None]:
## Playing around with Data Cell.
## Looks at the rows whose profession contains "senior".

Raw = main.ReadInData("../data/tcd ml 2019-20 income prediction training (with labels).csv")
Raw = main.HandleMissingData(Raw)
Raw['University Degree'] = Raw['University Degree'].replace(['0'], 'No')

# na=False: a missing profession counts as "no match", so the mask stays
# purely boolean and can be used for row selection.
sub_df = Raw[Raw['Profession'].str.contains("senior", na=False)]
sub_df.head(5)

In [None]:
# Summary statistics of the "senior" subset selected in the previous cell.
print(sub_df.describe())


# Income against year of record for that subset.
plt.scatter(x = sub_df['Year of Record'], y = sub_df['Income in EUR'])
plt.show()