# IMPORTED LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler, scale
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

#from sklearn.feature_selection import VarianceThreshold, SelectFromModel, SelectKBest, chi2
#from sklearn.ensemble import ExtraTreesClassifier
#import lightgbm as lgb

plt.rcParams['figure.figsize'] = [50, 60]

#%matplotlib inline
plt.style.use('seaborn-whitegrid')

# READING THE DATA

In [2]:
missing_values = ['0', 'nA', '#NUM!']
training_data = pd.read_csv("data/income-train.csv", na_values=missing_values)
test_data = pd.read_csv("data/income-test.csv", na_values=missing_values)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
rename_cols = {"Crime Level in the City of Employement" : "Crime Level",
              "Work Experience in Current Job [years]" : "Work Experience",
              "Body Height [cm]" : "Body Height",
              "Yearly Income in addition to Salary (e.g. Rental Income)" : "Side Income",
              "Total Yearly Income [EUR]" : "Total Income"}

training_data.rename(columns=rename_cols, inplace=True)
test_data.rename(columns=rename_cols, inplace=True)

# Training Data Pre-processing

## - Dropping/Changing Rows/Columns

In [4]:
training_data.drop(columns=['Instance'], inplace=True)

In [5]:
training_data.drop_duplicates(inplace=True)

In [6]:
training_data.dropna(subset=['Year of Record'], inplace=True)

In [7]:
# these are the outliers: the rows that don't go well with the graph of 'Year of Record' x 'Total Income' and 'Housing situation' x 'Total Income'
training_data.drop(index=list(training_data[(training_data['Total Income'] > 500000) & (training_data['Total Income'] < 700000) & (training_data['Year of Record'] < 1995)].index), inplace=True)

In [8]:
training_data['Gender'] = training_data['Gender'].replace('f', 'female')
test_data['Gender'] = test_data['Gender'].replace('f', 'female')

In [9]:
training_data['Side Income'] = training_data['Side Income'].map(lambda x: x.rstrip('\sEUR'))
test_data['Side Income'] = test_data['Side Income'].map(lambda x: x.rstrip('\sEUR'))

## - Dealing with NaN values

In [10]:
column_NaN_values = {"Housing Situation" : "shared_place",
                    "Crime Level" : 0,
                    "Work Experience" : "0",
                    "Satisfation with employer" : "Somewhat Unhappy",
                    "Gender" : "unknown",
                    "Profession" : "no job",
                    "University Degree" : "No",
                    "Wears Glasses" : 0,
                    "Hair Color" : "Bold",
                    "Country" : "No"}
                    #"Year of Record" : 0}

for col in training_data.dtypes.index.tolist():
    if col in column_NaN_values.keys():
        training_data[col] = training_data[col].fillna(value=column_NaN_values[col])
        test_data[col] = test_data[col].fillna(value=column_NaN_values[col])

test_data['Year of Record'] = test_data['Year of Record'].fillna(value=0)

## - Handling of Column Data Types

In [11]:
training_data['Work Experience'] = pd.to_numeric(training_data['Work Experience']).astype(int)
test_data['Work Experience'] = pd.to_numeric(test_data['Work Experience']).astype(int)

In [12]:
training_data['Side Income'] = pd.to_numeric(training_data['Side Income'])
test_data['Side Income'] = pd.to_numeric(test_data['Side Income'])

## - Encoding Categorical Features

In [13]:
for col in training_data.dtypes[training_data.dtypes == 'object'].index.tolist():
    le = LabelEncoder()
    le.fit(training_data[col].unique())
    training_data[col] = le.transform(training_data[col])
    
for col in test_data.dtypes[test_data.dtypes == 'object'].index.tolist():
    le = LabelEncoder()
    le.fit(test_data[col].unique())
    test_data[col] = le.transform(test_data[col])

## - Graphs

In [None]:
#sns.catplot(x="Satisfation with employer", y="Total Income", data=training_data[["Satisfation with employer", "Total Income"]])

In [None]:
#sns.countplot(training_data['Work Experience'])

In [None]:
#sns.relplot(x="Work Experience", y="Total Income", data=training_data[["Work Experience", "Total Income"]])

In [None]:
#training_data['Crime Level'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Crime Level'])

In [None]:
#sns.relplot(x="Crime Level", y="Total Income", data=training_data[["Crime Level", "Total Income"]]);

In [None]:
#training_data['Housing Situation'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Housing Situation'])

In [None]:
#sns.catplot(x="Housing Situation", y="Total Income", data=training_data[["Housing Situation", "Total Income"]])

In [None]:
#sns.countplot(training_data['Year of Record'])

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=training_data[["Year of Record", "Total Income"]]);

## - Building the Model

In [14]:
y = training_data['Total Income']
#x = training_data[['Year of Record', 'Work Experience', 'Age', 'Side Income']]
x = training_data.drop(columns=['Total Income'])
X_train,X_test,y_train,y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [15]:
X_train = scale(X_train)
X_test = scale(X_test)
X_for_pred = scale(test_data.drop(columns=['Total Income', 'Instance']))

In [16]:
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [17]:
mean_absolute_error(y_test, y_pred)

51545.20254412922

In [18]:
submission = clf.predict(X_for_pred)

In [19]:
sub_df = pd.DataFrame({'Instance': test_data['Instance'],
                       'Total Yearly Income [EUR]': submission})

In [20]:
sub_df.to_csv("tcd-ml-1920-group-income-submission.csv",index=False)

## - Side Work

In [None]:
#training_data.dropna(subset=['Year of Record', 'Country'], inplace=True) #******

In [None]:
training_data.corr()

In [None]:
sns.relplot(x="Crime Level", y="Total Income", data=training_data[["Crime Level", "Total Income"]])

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=pd.DataFrame([X_train["Year of Record"], y]))

In [None]:
regression = LinearRegression().fit(X_train, y_train)

In [None]:
regression.score(X_train, y_train)

In [None]:
pred = regression.predict(X_test)
#print("Root Mean squared Error")
#print((sqrt(mean_squared_error(np.exp(Y_test), np.exp(pred)))))

In [None]:
pred

In [None]:
#poly = PolynomialFeatures(degree=15)
#poly.fit_transform(X_train)

In [None]:

X = [[0.44, 0.68], [0.99, 0.23]]
vector = [109.85, 155.72]
predict= [[0.49, 0.18]]
#Edit: added second square bracket above to fix the ValueError problem

poly = PolynomialFeatures(degree=2)
X_ = poly.fit_transform(X)
predict_ = poly.fit_transform(predict)

clf = linear_model.LinearRegression()
clf.fit(X_, vector)
print clf.predict(predict_)

In [None]:
tt = training_data.drop(columns=['Total Income'])
st = scale(tt)


In [None]:
#st.head()

In [None]:
t = training_data.drop(columns=['Total Income'])
names = t.columns
#scaled_t = scale(t)

scaler = StandardScaler()
scaled_t = scaler.fit_transform(t)
scaled_t = pd.DataFrame(scaled_t, columns=names)


#scaler = preprocessing.StandardScaler()# Fit your data on the scaler object
#scaled_df = scaler.fit_transform(df)
#scaled_df = pd.DataFrame(scaled_df, columns=names)

In [None]:
#scaled_t['Year of Record'].mean(axis=1)

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=[scaled_t["Crime Level"], training_data['Total Income']])

In [None]:
#X_train = pd.DataFrame(poly.fit_transform(X_train), columns=X_train.columns)
#poly.fit_transform(X_train)
#X_train.head()

In [None]:
#sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
#pd.DataFrame(sel.fit_transform(X_train))

In [None]:
# Below is the model for polynomial regression with degree = 2

In [None]:
poly = PolynomialFeatures(degree=2)

In [None]:
X_train = poly.fit_transform(X_train)
X_test = poly.fit_transform(X_test)

X_for_pred = poly.fit_transform(test_data.drop(columns=['Total Income', 'Instance']))

In [None]:
regression = LinearRegression().fit(X_train, y_train)
regression.score(X_train, y_train)

In [None]:
y_pred = regression.predict(X_test)

In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
submission = regression.predict(X_for_pred)

In [None]:
sub_df = pd.DataFrame({'Instance': test_data['Instance'],
                       'Total Yearly Income [EUR]': submission})

In [None]:
sub_df.to_csv("tcd-ml-1920-group-income-submission.csv",index=False)