-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
438 lines (323 loc) · 17.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from sklearn import preprocessing
from sklearn import linear_model
import numpy as np
def main():
cars = pd.read_csv('input/autos.csv', encoding='latin_1')
print(">>> input length: {}".format(cars.shape[0])) # prints the length of data
print(">>> column length: {}".format(cars.shape[1])) # prints the length of column
# shows empty records and draw graph
# ### show_empty_records(cars)
# draw the rows that have null values
# ### cars_without_null_values = cars.dropna()
# ### print(">>> {} rows over {} is deleted that has null value at least one column on it\n"
# ### .format(cars.shape[0]-cars_without_null_values.shape[0], cars.shape[0]))
# ### describe_data_frame(cars) # creates table for the
# plots the histogram graph of the columns
# ### histogram_visualization_of_the_data_frame(cars)
clean_data = data_cleaning_for_regression(cars)
# export clean data to csv format for WEKA
data_frame_to_csv(clean_data, "output/clean_data.csv")
# runs the linear regressionmodel for clean data
regression_model(clean_data)
plt.show()
# plots the scatter graph of two columns
# ### draw_scatter_visualization_of_two_column(clean_data, 'yearOfRegistration', 'price')
# ### draw_scatter_visualization_of_two_column(clean_data, 'powerPS', 'price')
# ### draw_scatter_visualization_of_two_column(clean_data, 'brand', 'price')
# ### draw_scatter_visualization_of_two_column(clean_data, 'kilometer', 'price')
# ### draw_scatter_visualization_of_two_column(clean_data, 'gearbox', 'price')
# ### draw_scatter_visualization_of_two_column(clean_data, 'vehicleType', 'price')
# ### draw_scatter_visualization_of_two_column(clean_data, 'fuelType', 'price')
# ### draw_scatter_visualization_of_two_column(clean_data, 'notRepairedDamage', 'price')
# Shows the first five records
# print(">>> first five records:\n{}".format(cars.head()))
# print(">>> cars columns:\n{}".format(cars.columns))
# print(">>> Column Data_Type\n{}".format(cars.dtypes))
def data_cleaning_for_regression(cars):
""" Prepares the data for machine learning model.
# #################### COLUMNS THAT ARE COMPLETELY DELETED #################### #
dateCrawled: column is REMOVED.
nrOfPictures: has no value. column is REMOVED.
name: all values are almost unique. So, it has no meaning for model. column is REMOVED.
seller: has only 3 different values. So, it has no mean for model. rows and columns are REMOVED.
offerType: has only 12 different values. rows and columns are REMOVED.
abtest: column REMOVED.
monthOfRegistration: has 13 values should be 12. not necessary for model. columns are REMOVED.
dateCreated: column is REMOVED.
postalCode: column is REMOVED.
lastSeen: column is REMOVED.
# #################### COLUMNS THAT ARE PARTIALLY DELETED #################### #
price: limits the price between 0< price < 100.000. rows are CLEANED.
yearOfRegistration: limits the year between 1900<year<2017. rows are CLEANED.
powerPS: limits powerPS between 0<powerps<1000. rows are CLEANED.
fuelType: 5 type of values are removed other than "benzin" and "diesel". missing rows are REMOVED.
# #################### COLUMNS THAT HAVE CONSISTENT VALUES #################### #
vehicleType: FINE
model: FINE
kilometer: FINE
gearbox: FINE. missing rows are REMOVED.
brand: FINE
notRepairedDamage: FINE
:param cars: pandas's dataframe
:return: pandas's dataframe object
"""
# ###############################################################################
# #################### COLUMNS THAT ARE COMPLETELY DELETED #################### #
# ###############################################################################
# dateCrawled does not effect model. Removes the dateCrawled column
clean_data = cars.drop('dateCrawled', 1)
# pictures column has no value. Removes the picture column
clean_data = clean_data.drop('nrOfPictures', 1)
# name column has 233531 different value over 371528.
# So, name column is not a good value for our model.
# There can be extracted some useful information by NLP.
# However, I am removing this column for my model.
clean_data = clean_data.drop('name', 1) # removes the name column
# prints the seller column's information
# ### print(clean_data.groupby('seller').size())
# seller
# gewerblich 3
# privat 371525
clean_data = clean_data[clean_data.seller != 'gewerblich'] # removes the gewerblich rows
clean_data = clean_data.drop('seller', 1) # removes the seller column
# ### print(clean_data.groupby('offerType').size())
# offerType
# Angebot 371513
# Gesuch 12
clean_data = clean_data[clean_data.offerType != 'Gesuch'] # removes the gesuch rows
clean_data = clean_data.drop('offerType', 1) # removes the offerType column
# based on the comment on the website, it should be better to remove this column
clean_data = clean_data.drop('abtest', 1) # removes the abtest column
# month has 13 unique values. It should have 12 values.
# month values is not necessary for the price prediction.
# So, it can be removed.
# ### draw_column_histogram(clean_data, 'monthOfRegistration')
clean_data = clean_data.drop('monthOfRegistration', 1) # removes the month column
# to have better understanding postalCode should be matched with location in Germany.
# Location of the cars in Germany will not effect the price of the car.
# So, we can remove the postalCode column
# ### draw_column_histogram(clean_data, 'postalCode')
clean_data = clean_data.drop('postalCode', 1) # removes the postalCode column
# dateCreated has no effect on our model. dateCreated column is removed.
# print(clean_data['dateCreated'].describe())
clean_data = clean_data.drop('dateCreated', 1) # removes the dateCreated column
# lastSeen column as 157877 unique value. Has no effect on model.
# So, lastSeen column is removed
# ### draw_column_histogram_string(clean_data, 'lastSeen') # too many data for histogram
clean_data = clean_data.drop('lastSeen', 1) # removes the lastSeen column
# ##############################################################################
# #################### COLUMNS THAT ARE PARTIALLY DELETED #################### #
# ##############################################################################
# for removing the overestimated price. limits the max price 100.000
clean_data = clean_data[clean_data.price < 100000]
# 10772 cars are free. So, it is not a consistent value for the model. price=0 rows are removed.
# ### print(len(clean_data[clean_data.price == 0]), 'cars with price 0')
clean_data = clean_data[clean_data.price != 0]
# yearOfRegistration column has some inconsistent values.
# To reduces the number of the inconsistent rows, limit year between 1900<year<2017
clean_data = clean_data[(clean_data.yearOfRegistration >= 1900) & (clean_data.yearOfRegistration < 2017)]
# powerPS column has inconsistent values.
# To reduces the number of the inconsistent rows, limit powerPS between 0<powerPS<1000
clean_data = clean_data[(clean_data.powerPS > 0) & (clean_data.powerPS < 1000)]
# fuelType has 7 unique values. benzin and diesel dominates the other values.
# to have better values and simplify the data set, other values are removed.
# ### draw_column_histogram_string(clean_data, 'fuelType')
clean_data = clean_data[clean_data.fuelType != 'lpg'] # removes the rows
clean_data = clean_data[clean_data.fuelType != 'cng'] # removes the rows
clean_data = clean_data[clean_data.fuelType != 'hybrid'] # removes the rows
clean_data = clean_data[clean_data.fuelType != 'andere'] # removes the rows
clean_data = clean_data[clean_data.fuelType != 'elektro'] # removes the rows
# ### draw_column_histogram_string(clean_data, 'fuelType')
# ###############################################################################
# #################### COLUMNS THAT HAVE CONSISTENT VALUES #################### #
# ###############################################################################
# vehicleType looks fine.
# ### draw_column_histogram_string(clean_data, 'vehicleType')
# model looks fine.
# ### draw_column_histogram_string(clean_data, 'model')
# kilometer data looks fine
# ### draw_column_histogram(clean_data, 'kilometer')
# gearbox name histogram shows that the values are ok.
# ### draw_column_histogram_string(clean_data, 'gearbox')
# notRepairedDamage name histogram shows that the values are ok.
# ### draw_column_histogram_string(clean_data, 'notRepairedDamage')
# brand name histogram shows that the values are ok.
# ### draw_column_histogram_string(clean_data, 'brand')
clean_data = clean_data.dropna() # removes the rows that has missing values
# print(clean_data.head)
# ### print("cars: {}\nclean_Data: {}".format(cars.shape, clean_data.shape))
# reorders the column places
clean_data = clean_data[['vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
'kilometer', 'fuelType', 'brand', 'notRepairedDamage', 'price']]
return clean_data
# ############################################################################# #
# ######################### DATAFRAME EXPORTING METHOD ######################## #
def data_frame_to_excel(data_frame, file_name):
"""Converts pandas's dataframe object to excel format.
:param data_frame: pandas.dataframe
:param file_name: string
:return: None
"""
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
# Convert the dataframe to an XlsxWriter Excel object.
data_frame.to_excel(writer, sheet_name='sheet1')
# Close the Pandas Excel writer and output the Excel file.
writer.save()
def data_frame_to_csv(data_frame, file_name):
"""Converts pandas's dataframe object to csv format.
:param data_frame: pandas.dataframe
:param file_name: string
:return: None
"""
# sample_data = data_frame.sample(n=100, random_state=1)
# sample_data.to_csv(file_name, sep=',', index=False)
data_frame.to_csv(file_name, sep=',', index=False)
# ############################################################################# #
# ######################## DATA VISUALIZATION METHODS ######################### #
def histogram_visualization_of_the_data_frame(cars):
"""Plot the histogram graph of each column in the dataframe.
:param cars: pandas.dataframe
:return: None
"""
for column in cars:
if column == "dateCrawled" or column == "name" or column == "lastSeen":
continue
elif column == "price" or column == "postalCode" or column == "powerPS":
draw_column_histogram(cars, column)
else:
draw_column_histogram_string(cars, column)
def draw_column_histogram(data_frame, column_name):
"""Plots the histogram graph of the given column which has number value.
:param data_frame: pandas.dataframe
:param column_name: string
:return: None
"""
# ### print(data_frame[column_name].describe())
# sample_data = data_frame.sample(n=10000, random_state=1)
data_frame[column_name].hist(bins=20)
plt.title('Histogram ' + column_name)
plt.show()
def draw_column_histogram_string(data_frame, column_name):
"""Plots the histogram graph of the given column which has string value.
:param data_frame: pandas.dataframe
:param column_name: string
:return: None
"""
# ### print(data_frame[column_name].describe())
# sample_data = data_frame.sample(n=10000, random_state=1)
values = pd.Series(data_frame[column_name])
d = pd.DataFrame({column_name: values})
d.apply(pd.value_counts).plot(kind='bar', subplots=True)
# plt.title('Histogram ' + column_name)
plt.show()
def draw_scatter_visualization_of_two_column(cars, column_1, column_2):
"""Plots scatter graph for comparison of two given columns.
:param cars: dataframe
:param column_1: string
:param column_2: string
:return: None
"""
plt.scatter(cars[column_1], cars[column_2])
plt.xlabel(column_1)
plt.ylabel(column_2)
plt.show()
# ############################################################################# #
# ########################## DATA INSPECTION METHODS ########################## #
def describe_data_frame(cars):
"""Shows the highlights of the dataframe to have a general view.
:param cars: pandas.dataframe
:return: pandas.dataframe
"""
# calculates the general information for the dataframe
cars_describe = cars.describe(include='all').loc[['count', 'unique', 'top', 'freq']]
unique_value_list = []
for column in cars:
unique_value_list.append(cars[column].unique())
# print("{}'s has {} unique_values: {}\n".format(column, len(cars[column].unique()), cars[column].unique()))
# insert unique values to table
cars_describe_transpose = cars_describe.transpose()
cars_describe_transpose["unique_value_list"] = unique_value_list
# export dataframe to excel
# ### data_frame_to_excel(cars_describe_transpose, 'output/cars_describe.xlsx')
return cars_describe_transpose.transpose()
def show_empty_records(cars):
"""Prints the total missing value for each column and draw bar graph.
:param cars: data frame
:return: prints values and draw bar graph
"""
print("Column_Name \tNumber_of_Missing_Value\n{}".format(cars.isnull().sum()))
data_frame = pd.DataFrame()
print("sum: {}".format(cars.isnull().sum().tolist()))
data_frame['x'] = cars.columns.tolist()
data_frame['y'] = cars.isnull().sum().tolist()
graph = sns.barplot(data_frame.x, data_frame.y, palette="BuGn_d")
graph.set(ylim=(0, cars.shape[0]))
plt.xticks(rotation=90)
def regression_model(clean_data):
cleaned_data = clean_data[['vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
'kilometer', 'fuelType', 'brand', 'notRepairedDamage', 'price']]
# encode the data
cleaned_encoded_data = cleaned_data.apply(preprocessing.LabelEncoder().fit_transform)
# print(cleaned_encoded_data.head())
# ### print("encoded: {}".format(cleaned_encoded_data))
# Instantiate the model
linear_regression_model = linear_model.LinearRegression()
# fit the model. first parameter attributes. second parameter is result(price in this model).
linear_regression_model.fit(cleaned_encoded_data.iloc[:, :9], cleaned_encoded_data.iloc[:, 9])
# Regression coefficients
print("coefficient : {}".format(linear_regression_model.coef_))
# Model intercept
print("model_intercept: {}".format(linear_regression_model.intercept_))
# ############################################################################# #
# ################################ VALIDATION ################################# #
from sklearn.model_selection import train_test_split
# Split into train and validation
x_train, x_test, y_train, y_test = train_test_split(cleaned_encoded_data.iloc[:, :9],
cleaned_encoded_data.iloc[:, 9],
test_size=0.3)
# Display data shape
print("cleaned_shape: {}\n"
"x_train_shape: {}\n"
"y_train_shape: {}\n"
"x_test_shape : {}\n"
"y_test_shape : {}\n"
.format(cleaned_encoded_data.shape, x_train.shape, y_train.shape, x_test.shape, y_test.shape))
# Instantiate the model
model_sklearn_tv = linear_model.LinearRegression()
# fit the model
model_sklearn_tv.fit(x_train, y_train)
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
y_pred = model_sklearn_tv.predict(x_test)
print("y_pred_shape: {}\n".format(y_pred.shape))
# Find error : RMSE
print("RMSE ERROR: {}".format(np.sqrt(np.mean((y_test - y_pred)**2))))
# print(y_pred)
y_pred_list = y_pred.tolist()
y_test_list = y_test.tolist()
a = y_test_list[:200]
b = y_pred_list[:200]
print(a)
print(b)
print([abs(x - y) for x, y in zip(a, b)])
"""
result = y_test
se = pd.Series(y_pred_list)
result['prediction'] = se.values
print(result.head())
# data_frame_to_excel(y_test,'output/result.excel')
# Plot outputs
plt.scatter(x_test, y_test, color='g')
plt.plot(x_test, model_sklearn_tv.predict(x_test), color='k', linewidth=3)
plt.show()
"""
# MAE mean absolute error. best value is 0.0
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
print("mean_absulute_error: {}".format(mean_absolute_error(y_test, y_pred)))
print("mean_squared_error: {}".format(mean_squared_error(y_test, y_pred)))
if __name__ == "__main__":
main()