In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error

In [3]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 52kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


In [4]:
from catboost import CatBoostRegressor

In [5]:
df = pd.read_csv("gdrive/My Drive/reworked_data.csv")

In [6]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,id,occupation_type,age,years_employed,num_of_children,gender,own_car,own_real_estate,income_total,income_type,education,family_status,housing_type,num_of_family_members
0,0,5008804,Unknown,32,12,0,M,Y,Y,427500.0,Working,Higher education,Civil marriage,Rented apartment,2.0


In [7]:
del df["Unnamed: 0"], df["id"]

In [8]:
df.head(1)

Unnamed: 0,occupation_type,age,years_employed,num_of_children,gender,own_car,own_real_estate,income_total,income_type,education,family_status,housing_type,num_of_family_members
0,Unknown,32,12,0,M,Y,Y,427500.0,Working,Higher education,Civil marriage,Rented apartment,2.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   occupation_type        438557 non-null  object 
 1   age                    438557 non-null  int64  
 2   years_employed         438557 non-null  int64  
 3   num_of_children        438557 non-null  int64  
 4   gender                 438557 non-null  object 
 5   own_car                438557 non-null  object 
 6   own_real_estate        438557 non-null  object 
 7   income_total           438557 non-null  float64
 8   income_type            438557 non-null  object 
 9   education              438557 non-null  object 
 10  family_status          438557 non-null  object 
 11  housing_type           438557 non-null  object 
 12  num_of_family_members  438557 non-null  float64
dtypes: float64(2), int64(3), object(8)
memory usage: 43.5+ MB


In [10]:
target = "income_total"
X = df.drop(target, axis=1)
y = df[target]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [13]:
mapper = DataFrameMapper([
    ("occupation_type", LabelBinarizer()),
    (["age"], StandardScaler()),
    (["years_employed"], StandardScaler()),
    (["num_of_children"], StandardScaler()),
    ("gender", LabelBinarizer()),
    ("own_car", LabelBinarizer()),
    ("own_real_estate", LabelBinarizer()),
    ("income_type", LabelBinarizer()),
    ("education", LabelBinarizer()),
    ("family_status", LabelBinarizer()),
    ("housing_type", LabelBinarizer()),
    (["num_of_family_members"], StandardScaler())
    ],df_out=True)

In [14]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [15]:
model = LinearRegression().fit(Z_train,y_train)
print("LinearRegression train score is " + str(model.score(Z_train,y_train)))
print("LinearRegression test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

LinearRegression train score is 0.1612914857723141
LinearRegression test score is 0.16283204405425056
Mean squared error is 99249.52623851255


In [16]:
model = HuberRegressor().fit(Z_train,y_train)
print("HuberRegressor train score is " + str(model.score(Z_train,y_train)))
print("HuberRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

HuberRegressor train score is 0.1372243811625069
HuberRegressor test score is 0.13860871371008243
Mean squared error is 100675.17220723415


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [17]:
model = Lasso().fit(Z_train,y_train)
print("Lasso train score is " + str(model.score(Z_train,y_train)))
print("Lasso test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Lasso train score is 0.16129776778848226
Lasso test score is 0.16285235217551697
Mean squared error is 99248.32242772177


  positive)


In [18]:
model = Ridge().fit(Z_train,y_train)
print("Ridge train score is " + str(model.score(Z_train,y_train)))
print("Ridge test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Ridge train score is 0.1612996422077173
Ridge test score is 0.1628588949059796
Mean squared error is 99247.93458915294


In [19]:
model = ElasticNet().fit(Z_train,y_train)
print("ElasticNet train score is " + str(model.score(Z_train,y_train)))
print("ElasticNet test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

ElasticNet train score is 0.09484618426736124
ElasticNet test score is 0.097081235520868
Mean squared error is 103073.37184410098


In [20]:
model = RandomForestRegressor().fit(Z_train,y_train)
print("RandomForestRegressor train score is " + str(model.score(Z_train,y_train)))
print("RandomForestRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

RandomForestRegressor train score is 0.8939010483657379
RandomForestRegressor test score is 0.8441395900728706
Mean squared error is 42824.27814358648


In [21]:
model = DecisionTreeRegressor().fit(Z_train,y_train)
print("DecisionTreeRegressor train score is " + str(model.score(Z_train,y_train)))
print("DecisionTreeRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

DecisionTreeRegressor train score is 0.9001299549906846
DecisionTreeRegressor test score is 0.8308588906457107
Mean squared error is 44611.49027892762


In [22]:
model = AdaBoostRegressor().fit(Z_train,y_train)
print("AdaBoostRegressor train score is " + str(model.score(Z_train,y_train)))
print("AdaBoostRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

AdaBoostRegressor train score is -0.6106218727895794
AdaBoostRegressor test score is -0.6678904410594253
Mean squared error is 140089.60279194836


In [23]:
model = CatBoostRegressor().fit(Z_train,y_train)
print("CatBoostRegressor train score is " + str(model.score(Z_train,y_train)))
print("CatBoostRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Learning rate set to 0.109507
0:	learn: 108872.5544997	total: 104ms	remaining: 1m 44s
1:	learn: 107445.3362607	total: 156ms	remaining: 1m 17s
2:	learn: 106293.5925346	total: 204ms	remaining: 1m 7s
3:	learn: 105367.1509498	total: 254ms	remaining: 1m 3s
4:	learn: 104622.8088784	total: 301ms	remaining: 59.9s
5:	learn: 104010.9183649	total: 348ms	remaining: 57.7s
6:	learn: 103516.6709423	total: 415ms	remaining: 58.9s
7:	learn: 103100.2211025	total: 464ms	remaining: 57.5s
8:	learn: 102756.1668238	total: 511ms	remaining: 56.3s
9:	learn: 102475.9441546	total: 563ms	remaining: 55.7s
10:	learn: 102216.5904377	total: 613ms	remaining: 55.1s
11:	learn: 102030.8123749	total: 663ms	remaining: 54.6s
12:	learn: 101840.8011497	total: 717ms	remaining: 54.5s
13:	learn: 101702.4889009	total: 770ms	remaining: 54.2s
14:	learn: 101561.9427148	total: 823ms	remaining: 54.1s
15:	learn: 101432.1154006	total: 877ms	remaining: 53.9s
16:	learn: 101341.3965476	total: 922ms	remaining: 53.3s
17:	learn: 101204.6729423	

In [24]:
## Best performing model
model = RandomForestRegressor().fit(Z_train,y_train)
print("RandomForestRegressor train score is " + str(model.score(Z_train,y_train)))
print("RandomForestRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

RandomForestRegressor train score is 0.8934679932976399
RandomForestRegressor test score is 0.8440020511286295
Mean squared error is 42843.1691088139


In [25]:
from sklearn.pipeline import make_pipeline

In [26]:
pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('dataframemapper',
                 DataFrameMapper(default=False, df_out=True,
                                 features=[('occupation_type',
                                            LabelBinarizer(neg_label=0,
                                                           pos_label=1,
                                                           sparse_output=False)),
                                           (['age'],
                                            StandardScaler(copy=True,
                                                           with_mean=True,
                                                           with_std=True)),
                                           (['years_employed'],
                                            StandardScaler(copy=True,
                                                           with_mean=True,
                                                           with_std=True)),
                                 

In [27]:
df.head(1)

Unnamed: 0,occupation_type,age,years_employed,num_of_children,gender,own_car,own_real_estate,income_total,income_type,education,family_status,housing_type,num_of_family_members
0,Unknown,32,12,0,M,Y,Y,427500.0,Working,Higher education,Civil marriage,Rented apartment,2.0


In [32]:
df["housing_type"].value_counts()

House / apartment      393831
With parents            19077
Municipal apartment     14214
Rented apartment         5974
Office apartment         3922
Co-op apartment          1539
Name: housing_type, dtype: int64

In [42]:
new_M = pd.DataFrame({
    'occupation_type': "Managers",
    "age": [50],
    "years_employed": [20],
    "num_of_children": [3],
    "gender": "M",
    "own_car": "Y",
    "own_real_estate": "Y",
    "income_type": "Working",
    "education": "Higher education",
    "family_status": "Separated",
    "housing_type": "House / apartment",
    "num_of_family_members": [3]
  })

In [43]:
new_F = pd.DataFrame({
    'occupation_type': "Managers",
    "age": [50],
    "years_employed": [20],
    "num_of_children": [3],
    "gender": "F",
    "own_car": "Y",
    "own_real_estate": "Y",
    "income_type": "Working",
    "education": "Higher education",
    "family_status": "Separated",
    "housing_type": "House / apartment",
    "num_of_family_members": [3]
  })

In [44]:
print(pipe.predict(new_M), pipe.predict(new_F))

[326835.] [368865.]


In [None]:
# starting codes - going to manipulate data to show how easily bias can be created