# IMPORTED LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder #PolynomialFeatures, StandardScaler, scale
#from sklearn.linear_model import LinearRegression, Ridge

from category_encoders import *
import lightgbm as lgbm
from sklearn.model_selection import train_test_split

# Names used by the "Side Work" cells at the bottom of the notebook.
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, scale

# Default figure size (inches) for every plot in the notebook.
plt.rcParams['figure.figsize'] = [50, 60]

#%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so pick whichever spelling this
# installation provides (falling back to the default style if neither exists).
_whitegrid_style = next(
    (style_name
     for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style_name in plt.style.available),
    'default',
)
plt.style.use(_whitegrid_style)

# READING THE DATA

In [2]:
# Tokens that are parsed as NaN in *every* column of both CSV files.
# Note that '0' is in the list, so cells that are exactly "0" become missing.
missing_values = ['0', 'nA', '#NUM!', '#NA', '#N/A']

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the stray output under this cell looks like the tail of a
# mixed-dtype warning from read_csv -- confirm it disappears with this option.
read_options = dict(na_values=missing_values, low_memory=False)

training_data = pd.read_csv("data/income-train.csv", **read_options)
test_data = pd.read_csv("data/income-test.csv", **read_options)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Short aliases for the verbose CSV headers (keys are the raw header strings,
# spelled exactly as they appear in the files).
rename_cols = {
    "Crime Level in the City of Employement": "Crime Level",
    "Work Experience in Current Job [years]": "Work Experience",
    "Body Height [cm]": "Body Height",
    "Yearly Income in addition to Salary (e.g. Rental Income)": "Side Income",
    "Total Yearly Income [EUR]": "Total Income",
}

# Apply the same renaming to both frames so they keep matching column names.
for frame in (training_data, test_data):
    frame.rename(columns=rename_cols, inplace=True)

In [18]:
# Preview the first rows of the training frame.
# NOTE(review): this cell's execution count (18) is out of sequence; the saved
# output below has no 'Instance' column and shows the renamed headers, i.e. it
# was captured after the later preprocessing cells had already run.
training_data.head()

Unnamed: 0,Year of Record,Housing Situation,Work Experience,Satisfation with employer,Gender,Age,Country,Size of City,Profession,University Degree,Body Height,Side Income,Total Income
0,1940.0,shared_place,17,Unhappy,other,45,Afghanistan,25179,group head,No,182,0.0,6182.05
1,1940.0,shared_place,4,Unhappy,female,17,Afghanistan,2278204,heavy vehicle and mobile equipment service tec...,No,172,0.0,6819.69
2,1940.0,shared_place,21,Unhappy,female,48,Afghanistan,822134,sorter,Bachelor,144,0.0,8663.53
3,1940.0,shared_place,18,Average,female,42,Albania,59477,quality control senior engineer,No,152,0.0,2400.64
4,1940.0,shared_place,8,Happy,other,15,Albania,23494,logistician,Master,180,0.0,2816.18


# TRAINING DATA PREPROCESSING

## - Dropping/Changing Rows/Columns

In [4]:
# Features that are discarded from both frames.
unused_features = ['Crime Level', 'Hair Color', 'Wears Glasses']

# Training frame: also drop the row identifier.
training_data.drop(columns=['Instance'] + unused_features, inplace=True)
# Test frame: keep 'Instance' (needed for the submission) but drop the
# 'Total Income' column.
test_data.drop(columns=unused_features + ['Total Income'], inplace=True)

In [5]:
# Remove exact duplicate rows from the training frame (in place); the test
# frame is left untouched.
training_data.drop_duplicates(inplace=True)

In [6]:
# Discard training rows that have no 'Year of Record' (in place). Missing years
# in the test frame are not dropped; they are filled with the training mean in
# the NaN-handling cell further down.
training_data.dropna(subset=['Year of Record'], inplace=True)

In [None]:
# Outliers: rows that do not fit the 'Year of Record' x 'Total Income' and
# 'Housing situation' x 'Total Income' plots -- incomes strictly between
# 500000 and 700000 that were recorded before 1995.
income = training_data['Total Income']
is_outlier = (
    (income > 500000)
    & (income < 700000)
    & (training_data['Year of Record'] < 1995)
)
outlier_labels = training_data[is_outlier].index.tolist()
training_data.drop(index=outlier_labels, inplace=True)

In [7]:
# Fold the abbreviated label 'f' into 'female' in both frames.
for frame in (training_data, test_data):
    frame['Gender'] = frame['Gender'].replace('f', 'female')

In [8]:
def _strip_eur_suffix(amounts):
    """Return `amounts` with a trailing 'EUR' (and surrounding spaces) removed.

    The previous `x.rstrip('\\sEUR')` did not do this: `str.rstrip` takes a
    *set of characters*, not a regex, so it stripped any trailing backslash,
    's', 'E', 'U' or 'R' and never touched whitespace. It also raised
    AttributeError on NaN cells, because floats have no `rstrip`.

    Parameters
    ----------
    amounts : pandas.Series
        The raw 'Side Income' column.

    Returns
    -------
    pandas.Series
        The same values without the currency suffix; missing values stay
        missing, and an already-numeric column (e.g. when the cell is re-run)
        is returned unchanged.
    """
    if pd.api.types.is_numeric_dtype(amounts):
        return amounts
    return amounts.str.replace(r'\s*EUR\s*$', '', regex=True)


training_data['Side Income'] = _strip_eur_suffix(training_data['Side Income'])
test_data['Side Income'] = _strip_eur_suffix(test_data['Side Income'])

## - Dealing with NaN values

In [9]:
# Replacement value for missing entries, per column. Entries whose column is
# not present in training_data are simply never used.
column_NaN_values = {
    "Housing Situation": "shared_place",
    "Work Experience": "0",
    "Satisfation with employer": "Somewhat Unhappy",
    "Gender": "unknown",
    "Profession": "no job",
    "University Degree": "No",
    "Country": "No",
    "Wears Glasses": 0,
}

# Walk the training columns and fill the ones that have a configured default;
# the same default is applied to the matching test column.
columns_with_default = [
    name for name in training_data.columns if name in column_NaN_values
]
for name in columns_with_default:
    default = column_NaN_values[name]
    training_data[name] = training_data[name].fillna(value=default)
    test_data[name] = test_data[name].fillna(value=default)

# Missing years in the test frame get the mean year of the training frame.
mean_training_year = training_data['Year of Record'].mean()
test_data['Year of Record'] = test_data['Year of Record'].fillna(value=mean_training_year)

## - Handling of Column Data Types

In [10]:
# Parse 'Work Experience' as numbers and truncate to integers in both frames.
for frame in (training_data, test_data):
    experience = pd.to_numeric(frame['Work Experience'])
    frame['Work Experience'] = experience.astype(int)

In [11]:
# Convert the cleaned 'Side Income' column to a numeric dtype in both frames.
for frame in (training_data, test_data):
    frame['Side Income'] = pd.to_numeric(frame['Side Income'])

## - Encoding Categorical Features

In [None]:
#for col in training_data.dtypes[training_data.dtypes == 'object'].index.tolist():
#    le = LabelEncoder()
#    le.fit(training_data[col].unique())
#    training_data[col] = le.transform(training_data[col])
#    
#for col in test_data.dtypes[test_data.dtypes == 'object'].index.tolist():
#    le = LabelEncoder()
#    le.fit(test_data[col].unique())
#    test_data[col] = le.transform(test_data[col])

In [12]:
# Separate the regression target from the feature columns.
target_column = 'Total Income'
x = training_data.drop(columns=[target_column])
y = training_data[target_column]

# Row identifiers of the test set, kept aside for the submission file.
pred_instance = test_data['Instance']

In [13]:
# Fit the target encoder on all training rows, then encode both feature sets
# with that single fitted encoder.
# NOTE(review): the encoder is fitted before the train/hold-out split made
# below, so hold-out rows contribute to the encoding -- confirm this is intended.
target_encoder = TargetEncoder()
target_encoder.fit(x, y)

x = target_encoder.transform(x)

test_features = test_data.drop(columns=['Instance'])
x_for_pred = target_encoder.transform(test_features)

## - Building the Model

In [14]:
# Hold out 20% of the encoded training rows; the fixed random_state makes the
# split reproducible between runs.
X_train,X_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [15]:
# Number of boosting iterations (trees).
# The previous version asked for two different values: 'num_trees': 500 inside
# `params` and 45000 as the positional `num_boost_round`. LightGBM treats
# `num_trees` as an alias of `num_iterations` and lets the entry in `params`
# win, so only 500 trees were ever built. The value is now stated once.
NUM_BOOST_ROUND = 500

d_train = lgbm.Dataset(X_train, label=y_train)
params = {'learning_rate' : 0.03,
         'boosting_type' : 'gbdt',
         'objective' : 'regression',
         'metric' : 'mae',
         'num_leaves' : 255,
         'num_threads' : 16}

reg = lgbm.train(params, d_train, num_boost_round=NUM_BOOST_ROUND)

# Mean absolute error on the 20% hold-out split, so the model gets an honest
# score before its predictions are written to the submission file.
holdout_mae = np.mean(np.abs(reg.predict(X_test) - y_test))
print("Hold-out MAE: {:.2f}".format(holdout_mae))



In [16]:
# Predict the income for every row of the target-encoded test features.
y_pred = reg.predict(x_for_pred)

In [17]:
# Submission layout: the test-set identifier next to the predicted income,
# using the original (un-renamed) header for the target column.
submission_columns = {
    'Instance': pred_instance,
    'Total Yearly Income [EUR]': y_pred,
}
sub_df = pd.DataFrame(submission_columns)

# Written without the DataFrame index.
sub_df.to_csv("team203.csv", index=False)

## - Graphs

In [None]:
#sns.catplot(x="Satisfation with employer", y="Total Income", data=training_data[["Satisfation with employer", "Total Income"]])

In [None]:
#sns.countplot(training_data['Work Experience'])

In [None]:
#sns.relplot(x="Work Experience", y="Total Income", data=training_data[["Work Experience", "Total Income"]])

In [None]:
#training_data['Crime Level'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Crime Level'])

In [None]:
#sns.relplot(x="Crime Level", y="Total Income", data=training_data[["Crime Level", "Total Income"]]);

In [None]:
#training_data['Housing Situation'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Housing Situation'])

In [None]:
#sns.catplot(x="Housing Situation", y="Total Income", data=training_data[["Housing Situation", "Total Income"]])

In [None]:
#sns.countplot(training_data['Year of Record'])

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=training_data[["Year of Record", "Total Income"]]);

## - Side Work

In [20]:
# Pairwise correlation of the numeric columns only. training_data still holds
# the raw string categories (only the separate frame `x` was target-encoded),
# and recent pandas versions raise on non-numeric columns instead of silently
# skipping them, so they are filtered out explicitly.
training_data.select_dtypes(include='number').corr()

In [None]:
#training_data.dropna(subset=['Year of Record', 'Country'], inplace=True) #******

In [None]:
# 'Crime Level' is removed from training_data by the column-dropping cell, so
# plotting it unconditionally raises KeyError on a fresh top-to-bottom run.
# Only draw the scatter plot while the column is still present.
if "Crime Level" in training_data.columns:
    sns.relplot(x="Crime Level", y="Total Income", data=training_data[["Crime Level", "Total Income"]])
else:
    print("Skipping plot: 'Crime Level' is no longer a column of training_data.")

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=pd.DataFrame([X_train["Year of Record"], y]))

In [None]:
# Baseline: ordinary least-squares fit on the training split.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [None]:
# Score of the linear baseline on the same rows it was fitted on (for
# scikit-learn regressors `score` is R^2), so this is an optimistic figure.
regression.score(X_train, y_train)

In [None]:
# Predictions of the linear baseline on the hold-out split.
pred = regression.predict(X_test)
# Disabled RMSE report. NOTE(review): it refers to `Y_test`, `sqrt` and
# `mean_squared_error`, none of which are defined in the visible cells.
#print("Root Mean squared Error")
#print((sqrt(mean_squared_error(np.exp(Y_test), np.exp(pred)))))

In [None]:
#poly = PolynomialFeatures(degree=15)
#poly.fit_transform(X_train)

In [None]:

# Toy example: degree-2 polynomial regression on two hand-written samples.
X = [[0.44, 0.68], [0.99, 0.23]]
vector = [109.85, 155.72]
# One row per sample: the nested list is what avoids the ValueError raised
# when a flat list is passed.
predict = [[0.49, 0.18]]

poly = PolynomialFeatures(degree=2)
X_ = poly.fit_transform(X)
# Expand the query row with the transformer already fitted on X instead of
# re-fitting it on the prediction input.
predict_ = poly.transform(predict)

clf = linear_model.LinearRegression()
clf.fit(X_, vector)
# Python 3 print function (the Python 2 statement form is a SyntaxError here).
print(clf.predict(predict_))

In [None]:
# Standardise the numeric feature columns. training_data still contains the
# raw string categories, which `scale` cannot convert to floats, so only the
# numeric columns are passed on.
tt = training_data.drop(columns=['Total Income']).select_dtypes(include='number')
st = scale(tt)


In [None]:
# Standardise the numeric feature columns with a reusable StandardScaler.
# String columns are excluded because the scaler cannot convert them to floats.
t = training_data.drop(columns=['Total Income']).select_dtypes(include='number')
names = t.columns

scaler = StandardScaler()
scaled_t = scaler.fit_transform(t)
# fit_transform returns a bare array: restore the column names and the original
# row labels (rows were dropped earlier, so the index is not 0..n-1) to keep
# scaled_t aligned with training_data.
scaled_t = pd.DataFrame(scaled_t, columns=names, index=t.index)


#scaler = preprocessing.StandardScaler()# Fit your data on the scaler object
#scaled_df = scaler.fit_transform(df)
#scaled_df = pd.DataFrame(scaled_df, columns=names)

In [None]:
#scaled_t['Year of Record'].mean(axis=1)

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=[scaled_t["Crime Level"], training_data['Total Income']])

In [None]:
#X_train = pd.DataFrame(poly.fit_transform(X_train), columns=X_train.columns)
#poly.fit_transform(X_train)
#X_train.head()

In [None]:
#sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
#pd.DataFrame(sel.fit_transform(X_train))

In [None]:
#x = training_data[['Year of Record', 'Work Experience', 'Age', 'Side Income']]