In [140]:
from __future__ import unicode_literals
from hazm import *
from hazm import utils as utl
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import feature_selection
import math
import numpy as np

from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import (CountVectorizer, 
                                             TfidfVectorizer,
                                             TfidfTransformer)

In [157]:
train = pd.read_csv('vehicles.csv')
train = pd.DataFrame(train)

train['mileage'].fillna(value=train['mileage'].mean(), inplace=True)
train['year'].fillna(value=train['year'].value_counts().idxmax(), inplace=True)
train['brand'].fillna(value='سایر', inplace=True)
train['year'] = train['year'].replace(to_replace='<1366', value='1355')

train['year'] = list(map(int, train['year']))


date_dict = {'Saturday': 0,
            'Sunday':1,
            'Monday':2,
            'Tuesday':3,
            'Wednesday':4,
            'Thursday':5,
            'Friday':6}

def date_converter(x):
    x = x.split()
    day = date_dict[x[0]]
    
    hour = 0
    if x[1][2] == 'A':
        if int(x[1][:2]) == 12:
            hour = 0
        hour = int(x[1][:2])
    else:
        if int(x[1][:2]) == 12:
            hour = 12
        hour = int(x[1][:2]) + 12
        
    return day * 24 + hour

train['created_at'] = list(map(date_converter, train['created_at']))
train['category'] = train['category'].replace(to_replace = ['light', 'heavy'], value = [0, 1])

index = 0
brand_dict = dict()
for i in train['brand'].unique():
    brand_dict.update({i:index})
    index += 1
train['brand'] = train['brand'].replace(brand_dict)

In [158]:

normalizer = Normalizer()
lemmatizer = Lemmatizer()
stop_words = open('stopwords', 'r').read().splitlines()
stop_words = [lemmatizer.lemmatize(w) for w in stop_words]
stop_words.append('\n')
stop_words.append('\r')


In [159]:
train = train.drop('description', 1)

for index,row in train.iterrows():
    normalizer.normalize(row['title'])
    new_list = list()
    for w in word_tokenize(row['title']):
        word = lemmatizer.lemmatize(w)
        if word not in stop_words:
            new_list.append(w)
    row['title'] = new_list   
    


In [160]:
vectorizer = TfidfVectorizer(analyzer='word', min_df=500, max_features=40)  
vectorized = vectorizer.fit_transform(train['title'])
q = pd.DataFrame(vectorized.toarray(), columns=vectorizer.get_feature_names())

train[vectorizer.get_feature_names()] = q[vectorizer.get_feature_names()].values
train = train.drop('title', 1)
train.info()
q.info()
# train = train.drop('description', 1)



train

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130443 entries, 0 to 130442
Data columns (total 47 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   brand        130443 non-null  int64  
 1   category     130443 non-null  int64  
 2   created_at   130443 non-null  int64  
 3   image_count  130443 non-null  int64  
 4   mileage      130443 non-null  float64
 5   price        130443 non-null  int64  
 6   year         130443 non-null  int64  
 7   206          130443 non-null  float64
 8   405          130443 non-null  float64
 9   83           130443 non-null  float64
 10  89           130443 non-null  float64
 11  90           130443 non-null  float64
 12  93           130443 non-null  float64
 13  ال           130443 non-null  float64
 14  ام           130443 non-null  float64
 15  ای           130443 non-null  float64
 16  بدون         130443 non-null  float64
 17  بک           130443 non-null  float64
 18  بی           130443 non-

Unnamed: 0,brand,category,created_at,image_count,mileage,price,year,206,405,83,...,مشکی,نیسان,هاچ,وانت,وی,پارس,پراید,پژو,پیکان,کارخانه
0,0,1,91,4,100862.291944,-1,1393,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
1,0,0,100,3,180000.000000,-1,1366,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
2,1,0,107,0,290000.000000,8500000,1381,0.0,0.0,0.0,...,0.775334,0.0,0.0,0.0,0.0,0.0,0.000000,0.488448,0.0,0.000000
3,2,0,109,3,175000.000000,19500000,1372,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
4,3,0,127,4,80000.000000,23900000,1391,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.667040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130438,0,1,111,2,100862.291944,48000000,1393,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
130439,0,1,114,4,100862.291944,-1,1393,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
130440,3,0,114,3,20000.000000,-1,1392,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
130441,5,0,135,4,123000.000000,6900000,1379,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.494819,0.000000,0.0,0.000000


In [161]:
x = train.drop('price', 1)
y = train.price

In [186]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

mi = mutual_info_regression(X_train, y_train)

KeyboardInterrupt: 

In [185]:
mu = pd.Series(mi)
mu.index = X_train.columns
y

0               -1
1               -1
2          8500000
3         19500000
4         23900000
            ...   
130438    48000000
130439          -1
130440          -1
130441     6900000
130442    17400000
Name: price, Length: 130443, dtype: int64

In [194]:
from sklearn.metrics import mean_squared_error

def train(model,X_train, X_test, y_train):
    model.fit(X_train, y_train)
    y_pred= model.predict(X_test)
    return y_pred

def test(y_test, y_pred):
    mse = mean_squared_error(np.array(y_test), y_pred)
    rmse = mean_squared_error(np.array(y_test), y_pred, squared = False)
    return mse,rmse

In [201]:
from sklearn.tree import DecisionTreeRegressor


param_grid = {
    'random_state': [0,42,60],
    'max_depth': range(5,70),
    'n_estimators': [20,100, 200, 300, 1000]
}


clf = DecisionTreeRegressor(max_depth = 5)
# grid_search = GridSearchCV(estimator = clf, param_grid = param_grid,cv = 3, n_jobs = -1, verbose = 2)

y_pred = train(clf,X_train, X_test, y_train)
mse, rmse = test(y_test, y_pred)

mse,rmse

(582649135113486.8, 24138126.17237483)