In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
raw_df = pd.read_csv(filepath_or_buffer='wooutlier.csv')

In [3]:
raw_df=raw_df[raw_df['fusedyear']==2022]

In [4]:
cols=["fusedyear","fusedmonth","fusedday"]
raw_df['date'] = raw_df[cols].apply(lambda x: '-'.join(x.values.astype(str)), axis="columns")

In [5]:
raw_df['weekDay']=pd.to_datetime(raw_df['date']).dt.dayofweek

In [6]:
del raw_df['fusedyear']
del raw_df['fusedmonth']
del raw_df['fusedday']
del raw_df['date']

In [7]:
raw_df.corr()

Unnamed: 0,fusedhour,toplam_yolcu,weekDay
fusedhour,1.0,0.453894,-0.017445
toplam_yolcu,0.453894,1.0,-0.145261
weekDay,-0.017445,-0.145261,1.0


In [8]:
train, test = train_test_split(raw_df, test_size=0.2)

In [9]:
cols=["weekDay","fusedhour"]
train_X=train[cols]

In [10]:
test_X=test[cols]

In [11]:
train_y=train['toplam_yolcu']

In [12]:
test_y=test['toplam_yolcu']

In [13]:
train_X

Unnamed: 0,weekDay,fusedhour
19512,6,2
19834,6,20
17180,6,0
16538,5,7
17714,3,14
...,...,...
19616,3,22
17717,3,19
17047,6,21
19735,2,7


In [14]:
train_y

19512      471
19834     9270
17180     1666
16538     5380
17714    11899
         ...  
19616     7000
17717    17770
17047     6173
19735    17607
19077    10045
Name: toplam_yolcu, Length: 3633, dtype: int64

In [15]:
test_X

Unnamed: 0,weekDay,fusedhour
19974,6,3
18795,6,21
19263,1,6
20066,3,12
17404,3,2
...,...,...
18956,0,17
18613,5,10
19509,5,23
20246,4,13


In [16]:
test_y

19974      166
18795    10302
19263     6484
20066     6650
17404       13
         ...  
18956    22300
18613     6926
19509     5419
20246     8788
15898      947
Name: toplam_yolcu, Length: 909, dtype: int64

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
reg = LinearRegression().fit(train_X,train_y)

In [19]:
reg.score(test_X,test_y)

0.22418578880931775

In [20]:
from sklearn import linear_model

In [21]:
clf = linear_model.Lasso(alpha=0.1)

In [22]:
clf.fit(train_X,train_y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [23]:
clf.score(test_X,test_y)

0.2241862544710942

In [24]:
from sklearn.linear_model import Ridge

In [25]:
clf2 = Ridge(alpha=0.1)

In [26]:
clf2.fit(train_X,train_y)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [27]:
clf2.score(test_X,test_y)

0.22418584488529847

In [28]:
from sklearn.linear_model import ElasticNet

In [29]:
regr = ElasticNet(alpha=0.1)

In [30]:
regr.fit(train_X,train_y)

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [31]:
regr.score(test_X,test_y)

0.2242831198683829

In [32]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [33]:
model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', LinearRegression(fit_intercept=False))])

In [34]:
model.fit(train_X,train_y)

Pipeline(memory=None,
         steps=[('poly',
                 PolynomialFeatures(degree=2, include_bias=True,
                                    interaction_only=False, order='C')),
                ('linear',
                 LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [35]:
model.score(test_X,test_y)

0.5522809558162829

In [36]:
from sklearn.linear_model import RANSACRegressor

In [37]:
ransac = RANSACRegressor(LinearRegression(),
		max_trials=4, 		# Number of Iterations
		min_samples=2, 		# Minimum size of the sample
		loss='absolute_loss', 	# Metrics for loss
		residual_threshold=10 	# Threshold
		)

In [38]:
ransac.fit(train_X,train_y)

RANSACRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                                n_jobs=None, normalize=False),
                is_data_valid=None, is_model_valid=None, loss='absolute_loss',
                max_skips=inf, max_trials=4, min_samples=2, random_state=None,
                residual_threshold=10, stop_n_inliers=inf,
                stop_probability=0.99, stop_score=inf)

In [39]:
ransac.score(test_X,test_y)

-0.8962280138286591

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
forest=RandomForestRegressor(n_estimators=20,
                             max_depth=20,
                             criterion='mse',
                            )

In [42]:
forest.fit(train_X,train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [43]:
forest.score(test_X,test_y)

0.8997782981276871

In [50]:
# Saving the model
import pickle
filename = '/root/staj/model.sav'
pickle.dump(forest, open(filename, 'wb'))

In [44]:
from sklearn.preprocessing import StandardScaler

In [45]:
pipe = Pipeline([
    ('scaler', StandardScaler()), 
    ('poly', PolynomialFeatures(degree=2)),
    ('forest', RandomForestRegressor(n_estimators=20,
                             max_depth=20,
                             criterion='mse',
                            ))
])

In [46]:
pipe.fit(train_X,train_y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('poly',
                 PolynomialFeatures(degree=2, include_bias=True,
                                    interaction_only=False, order='C')),
                ('forest',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=20,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=20, n_jobs=None,
                                       oob_score=False, ran

In [47]:
pipe.score(test_X,test_y)

0.8991378065535293

In [48]:
from sklearn.svm import SVR

# Choose regression method and set hyperparameter
svr_rbf=SVR(C=1.0, epsilon=0.2, kernel='rbf')

# Training of the regression model
svr_rbf.fit(train_X,train_y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [49]:
svr_rbf.score(test_X,test_y)

0.10009160498841962