In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures

In [2]:
def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [3]:
train_raw = pd.read_csv('Dataset/train.csv')
test_raw = pd.read_csv('Dataset/test.csv')

train = train_raw[:100000]

In [4]:
train.set_index('id', inplace=True)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    100000 non-null  object
 1   seller  100000 non-null  int64 
 2   item    100000 non-null  int64 
 3   sales   100000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 3.8+ MB


In [6]:
train.isnull().sum()

date      0
seller    0
item      0
sales     0
dtype: int64

In [7]:
train.describe()

Unnamed: 0,seller,item,sales
count,100000.0,100000.0,100000.0
mean,5.5,3.3,44.37659
std,2.872296,1.615558,23.286029
min,1.0,1.0,1.0
25%,3.0,2.0,26.0
50%,5.5,3.0,40.0
75%,8.0,5.0,59.0
max,10.0,6.0,164.0


In [8]:
train.seller.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [9]:
train.item.unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [10]:
train.sales.unique()

array([ 21,  45,  28,  34,  27,  25,  26,  37,  32,  22,  20,  35,  38,
        24,  40,  33,  11,  29,  39,  17,  30,  53,  47,  41,  46,  16,
        14,  43,  19,  44,  23,  18,  42,  15,  31,  36,  48,   8,  13,
         9,  49,  12,  51,  55,  52,  57,  56,  62,  59,  54,  61,  58,
        60,  50,  64,  63,  75,  66,  82,  72,  70,  76,  67,  78,  68,
        77,  65,  74,  73,  84,  71,  69,  81,  80,  91,  79,  93,  83,
        87,  98,  85, 100,  90,  94,  86,  88,  89,  95,  92, 105,  97,
       102,  96, 103, 113, 106,  99, 109, 107, 104, 108, 110, 101, 120,
       117, 116, 114, 115, 126, 118, 112, 119, 111, 123, 130, 128, 124,
        10,   7,   3,   5,   6,   4, 121, 122, 127, 132, 125, 129, 143,
       131, 133, 134, 138, 153, 150, 148, 141, 140, 152, 135, 139, 136,
       137, 144, 149, 142, 157, 164, 161, 147, 154, 145,   2,   1],
      dtype=int64)

In [11]:
train.date = pd.to_datetime(train['date'], format='%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [12]:
corr_matrix = train.corr()

In [13]:
corr_matrix['sales'].sort_values()

item     -0.048107
seller   -0.009112
sales     1.000000
Name: sales, dtype: float64

In [14]:
feature = ['item', 'seller']
X = train[feature]
y = train['sales']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [16]:
model = RandomForestRegressor()

In [17]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [18]:
val_predict = model.predict(X_test)

In [19]:
print(smape(y_test, val_predict))

24.555502928582673


In [20]:
polynomial_svm_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=10, loss='hinge'))
])

In [21]:
svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(X_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=1.5, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [22]:
val_predict = svm_reg.predict(X_test)

In [23]:
print(smape(y_test, val_predict))

43.40078836467707


In [None]:
svm_poly = SVR(kernel='poly', degree=2, C=100, epsilon=0.1)
svm_poly.fit(X_train, y_train)

In [None]:
val_predict = svm_poly.predict(X_test)

In [None]:
print(smape(y_test, val_predict))