**Подключение библиотек и скриптов**

In [1]:
import numpy as np
from numpy.linalg import inv
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def r2_quality(y_prd, y_tru):
    y_mn = np.average(y_tru,axis=0)
    return 1.0 - np.sum((y_prd - y_tru)**2)/np.sum((y_tru - y_mn)**2)

**Пути к директориям и файлам**

In [3]:
# input
TRAIN_DATASET_PATH = 'train.csv'
TEST_DATASET_PATH  = 'test.csv'

# output
PREP_DATASET_PATH  = 'GB_predictions.csv'

**Загрузка  данных<a class="anchor" id="load_data"></a>**

In [49]:
df = pd.read_csv(TRAIN_DATASET_PATH)
df.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,0,40.0,0.0,1400.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,63.0
1,1,48.0,4.0,2850.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,86.0
2,2,39.0,0.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0
3,3,46.0,5.0,1400.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
4,4,43.0,1.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0


In [50]:
df.corr()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
Id,1.0,-0.004596,0.007408,-0.004433,-0.005077,-0.01057,0.002694,-1.6e-05,0.017723,-0.014869,-0.004482,0.004121
age,-0.004596,1.0,0.059947,-0.005462,-0.000976,0.004045,0.00125,-0.005026,-0.012546,0.01313,0.010606,-0.007646
years_of_experience,0.007408,0.059947,1.0,0.248311,0.194097,0.008451,0.004246,-0.001722,-0.010241,-0.011129,0.01864,0.205417
lesson_price,-0.004433,-0.005462,0.248311,1.0,0.790087,-0.006432,0.00513,-0.00486,-0.012018,0.010525,-0.001142,0.721179
qualification,-0.005077,-0.000976,0.194097,0.790087,1.0,0.007529,-0.002683,-0.007504,-0.008047,0.00361,-0.005109,0.755963
physics,-0.01057,0.004045,0.008451,-0.006432,0.007529,1.0,0.019852,0.000661,0.004238,0.001904,0.004867,0.187726
chemistry,0.002694,0.00125,0.004246,0.00513,-0.002683,0.019852,1.0,0.007866,0.009974,-0.004447,-0.008079,0.017825
biology,-1.6e-05,-0.005026,-0.001722,-0.00486,-0.007504,0.000661,0.007866,1.0,0.010146,-0.013042,0.010995,0.023022
english,0.017723,-0.012546,-0.010241,-0.012018,-0.008047,0.004238,0.009974,0.010146,1.0,-0.008148,-0.00456,0.013174
geography,-0.014869,0.01313,-0.011129,0.010525,0.00361,0.001904,-0.004447,-0.013042,-0.008148,1.0,-0.005049,0.014401


**Подготовка данных**

In [51]:
# отделим таргет и data
y_tot = df['mean_exam_points']
df_X  = pd.DataFrame(df, columns=['years_of_experience','lesson_price','qualification','physics','chemistry','biology'])

In [52]:
# разобъем на тренировочный и тестовый датасеты
X_trn,X_tst, y,y_tst = train_test_split(df_X,y_tot, test_size = 0.3, random_state = 53)

# и нормализуем эти признаки
X = np.array((X_trn-X_trn.min(axis=0))/(X_trn.max(axis=0)-X_trn.min(axis=0))) 

**Поставим задачу линейной регрессии в виде: A*c=f, где f аппроксимирует наш таргет,
а результатом станет нахождение вектора коэффициентов c (для двух наборов это будут разные 
коэффициенты: c0 и c1)**

In [53]:
A11,A12,A13,A14,A15,A16,A17 = len(y),sum(X[:,0]),        sum(X[:,1]),        sum(X[:,2]),        sum(X[:,3]),        sum(X[:,4]),        sum(X[:,5])
A21,A22,A23,A24,A25,A26,A27 = A12,   sum(X[:,0]*X[:,0]), sum(X[:,1]*X[:,0]), sum(X[:,2]*X[:,0]), sum(X[:,3]*X[:,0]), sum(X[:,4]*X[:,0]), sum(X[:,5]*X[:,0])
A31,A32,A33,A34,A35,A36,A37 = A13,   A23,                sum(X[:,1]*X[:,1]), sum(X[:,2]*X[:,1]), sum(X[:,3]*X[:,1]), sum(X[:,4]*X[:,1]), sum(X[:,5]*X[:,1])
A41,A42,A43,A44,A45,A46,A47 = A14,   A24,  A34,                              sum(X[:,2]*X[:,2]), sum(X[:,3]*X[:,2]), sum(X[:,4]*X[:,2]), sum(X[:,5]*X[:,2])
A51,A52,A53,A54,A55,A56,A57 = A15,   A25,  A35,  A45,                                            sum(X[:,3]*X[:,3]), sum(X[:,4]*X[:,3]), sum(X[:,5]*X[:,3])
A61,A62,A63,A64,A65,A66,A67 = A16,   A26,  A36,  A46,  A56,                                                          sum(X[:,4]*X[:,4]), sum(X[:,5]*X[:,4])
A71,A72,A73,A74,A75,A76,A77 = A17,   A27,  A37,  A47,  A57,  A67,                                                                        sum(X[:,5]*X[:,5])

F1,F2,F3,F4,F5,F6,F7  = sum(y),sum(y*X[:,0]),sum(y*X[:,1]),sum(y*X[:,2]),sum(y*X[:,3]),sum(y*X[:,4]),sum(y*X[:,5]) 

In [54]:
A = np.array([[A11,A12,A13,A14,A15,A16,A17],
              [A21,A22,A23,A24,A25,A26,A27],
              [A31,A32,A33,A34,A35,A36,A37],
              [A41,A42,A43,A44,A45,A46,A47],
              [A51,A52,A53,A54,A55,A56,A57],
              [A61,A62,A63,A64,A65,A66,A67],
              [A71,A72,A73,A74,A75,A76,A77]])
F = np.array([F1,F2,F3,F4,F5,F6,F7])
               

In [55]:
A

array([[7000.        , 1385.2       , 2782.58666667, 1646.33333333,
        2612.        ,  902.        ,  788.        ],
       [1385.2       ,  491.        ,  591.24533333,  386.36666667,
         522.7       ,  176.8       ,  155.2       ],
       [2782.58666667,  591.24533333, 1239.10862222,  853.82666667,
        1037.09333333,  362.36      ,  316.36      ],
       [1646.33333333,  386.36666667,  853.82666667,  869.66666667,
         628.        ,  216.        ,  191.33333333],
       [2612.        ,  522.7       , 1037.09333333,  628.        ,
        2612.        ,  351.        ,  284.        ],
       [ 902.        ,  176.8       ,  362.36      ,  216.        ,
         351.        ,  902.        ,  114.        ],
       [ 788.        ,  155.2       ,  316.36      ,  191.33333333,
         284.        ,  114.        ,  788.        ]])

In [56]:
F

array([449223.        ,  92096.6       , 187847.22666667, 124330.33333333,
       176395.        ,  58627.        ,  51800.        ])

In [57]:
AI = inv(A)

In [58]:
c = inv(A).dot(F)
print(c)

[43.37876848  1.70814663 30.90035    25.55853469  5.1638888   0.5540629
  1.46812707]


In [59]:
yp = c[0] + c[1]*X[:,0] + c[2]*X[:,1] + c[3]*X[:,2] + c[4]*X[:,3] + c[5]*X[:,4] + c[6]*X[:,5]

In [60]:
r2 = r2_quality(yp, y)
print(r2)
# 0.5510971479147218 - вариант 1
# 0.6385190377458905 - вариант 3

0.6386792270203765


In [62]:
print(yp.min(),yp.max())
yp[yp>100] = 100
print(yp.min(),yp.max())

45.46885579350854 104.3784112841845
45.46885579350854 100.0


**тепреь применим найденные коэффициенты к тестовому набору**

In [64]:
df_tst = pd.read_csv(TEST_DATASET_PATH)
df_tst.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history
0,10000,46.0,3.0,1050.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10001,43.0,3.0,1850.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10002,52.0,1.0,1550.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,10003,57.0,6.0,2900.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0
4,10004,44.0,4.0,3150.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0


In [65]:
print(df_tst.shape)

(10000, 11)


In [66]:
# в признаках оставим только релевантные
df_X = pd.DataFrame(df_tst, columns=['years_of_experience','lesson_price','qualification','physics','chemistry','biology'])
# и нормализуем эти признаки
X = np.array((df_X-df_X.min(axis=0))/(df_X.max(axis=0)-df_X.min(axis=0))) 

In [71]:
yp = pd.DataFrame(df_tst, columns=['Id','physics'])
yp['physics'] = 0
yp.columns = ['Id','mean_exam_points']
yp['mean_exam_points'] = c[0] + c[1]*X[:,0] + c[2]*X[:,1] + c[3]*X[:,2] + c[4]*X[:,3] + c[5]*X[:,4] + c[6]*X[:,5]

In [72]:
yp.describe()

Unnamed: 0,Id,mean_exam_points
count,10000.0,10000.0
mean,14999.5,63.990244
std,2886.89568,10.812165
min,10000.0,44.403656
25%,12499.75,54.325097
50%,14999.5,61.435828
75%,17499.25,70.988435
max,19999.0,99.836067


In [73]:
yp['mean_exam_points'] = np.rint(yp['mean_exam_points'])

In [74]:
yp.head()

Unnamed: 0,Id,mean_exam_points
0,10000,51.0
1,10001,66.0
2,10002,60.0
3,10003,90.0
4,10004,90.0


In [75]:
yp.to_csv(PREP_DATASET_PATH, index=False)