**Подключение библиотек и скриптов**

In [144]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from numpy.linalg import inv

In [145]:
def r2_quality(y_prd, y_tru):
    y_mn = np.average(y_tru,axis=0)
    return 1.0 - np.sum((y_prd - y_tru)**2)/np.sum((y_tru - y_mn)**2)

**Пути к директориям и файлам**

In [146]:
# input
TRAIN_DATASET_PATH = 'train.csv'
TEST_DATASET_PATH  = 'test.csv'

# output
PREP_DATASET_PATH  = 'GB_predictions.csv'

**Загрузка  данных<a class="anchor" id="load_data"></a>**

In [147]:
df = pd.read_csv(TRAIN_DATASET_PATH)
df.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,0,40.0,0.0,1400.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,63.0
1,1,48.0,4.0,2850.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,86.0
2,2,39.0,0.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0
3,3,46.0,5.0,1400.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
4,4,43.0,1.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0


**Подготовка данных**

In [148]:
# разделим набор по значению столбца "физика"
df0 = df[df['physics']==0]
df1 = df[df['physics']==1]
print(df0.shape, df1.shape)

(6250, 12) (3750, 12)


In [149]:
# отделим таргеты
y0 = df0['mean_exam_points']
y1 = df1['mean_exam_points']

In [150]:
# в признаках оставим только релевантные, но из двух коррелирующих признаков qualification и 
# lesson_price выберем вторую, т.к. она более подробная
df_X0 = pd.DataFrame(df0, columns=['years_of_experience','lesson_price'])
df_X1 = pd.DataFrame(df1, columns=['years_of_experience','lesson_price'])

# и нормализуем эти признаки
X0 = np.array((df_X0-df_X0.min(axis=0))/(df_X0.max(axis=0)-df_X0.min(axis=0))) 
X1 = np.array((df_X1-df_X1.min(axis=0))/(df_X1.max(axis=0)-df_X1.min(axis=0))) 

**Поставим задачу линейной регрессии в виде: A*c=f, где f аппроксимирует наш таргет,
а результатом станет нахождение вектора коэффициентов c (для двух наборов это будут разные 
коэффициенты: c0 и c1) 
Во втором варианте применим полиномиальную регрессию (до 2 степени)**

In [182]:
A0_11,A0_12,A0_13,A0_14,A0_15,A0_16 = len(y0),sum(X0[:,0]),sum(X0[:,1]), sum(X0[:,0]**2), sum(X0[:,1]**2), sum(X0[:,0]*X0[:,1])
A0_21,A0_22,A0_23,A0_24,A0_25,A0_26 =   A0_12, A0_14, A0_16, sum(X0[:,0]**3), sum(X0[:,0]*X0[:,1]**2),  sum(X0[:,1]*X0[:,0]**2)
A0_31,A0_32,A0_33,A0_34,A0_35,A0_36 =   A0_13, A0_23, A0_15, A0_26,  sum(X0[:,1]**3), A0_25
A0_41,A0_42,A0_43,A0_44,A0_45,A0_46 =   A0_14, A0_24, A0_34, sum(X0[:,0]**4),  sum(X0[:,0]**2*X0[:,1]**2), sum(X0[:,0]**3*X0[:,1])
A0_51,A0_52,A0_53,A0_54,A0_55,A0_56 =   A0_15, A0_25, A0_35, A0_45,  sum(X0[:,1]**4), sum(X0[:,1]**3*X0[:,0])
A0_61,A0_62,A0_63,A0_64,A0_65,A0_66 =   A0_16, A0_26, A0_36, A0_46, A0_56, A0_45

F0_1,F0_2,F0_3,F0_4,F0_5,F0_6       = sum(y0),sum(y0*X0[:,0]),sum(y0*X0[:,1]),sum(y0*X0[:,0]**2),sum(y0*X0[:,1]**2),sum(y0*X0[:,0]*X0[:,1])

In [185]:
A0 = np.array([[A0_11,A0_12,A0_13,A0_14,A0_15,A0_16],[A0_21,A0_22,A0_23,A0_24,A0_25,A0_26],
               [A0_31,A0_32,A0_33,A0_34,A0_35,A0_36],[A0_41,A0_42,A0_43,A0_44,A0_45,A0_46],
               [A0_51,A0_52,A0_53,A0_54,A0_55,A0_56],[A0_61,A0_62,A0_63,A0_64,A0_65,A0_66]])
F0 = np.array([F0_1,F0_2,F0_3,F0_4,F0_5,F0_6])

In [186]:
A0

array([[6250.        , 1543.125     , 2502.86666667,  683.859375  ,
        1125.63715556,  667.27333333],
       [1543.125     ,  683.859375  ,  667.27333333,  360.14648438,
         333.26608889,  307.00166667],
       [2502.86666667,  667.27333333, 1125.63715556,  307.00166667,
         564.74217007,  333.26608889],
       [ 683.859375  ,  360.14648438,  307.00166667,  213.56323242,
         160.51941667,  165.51989583],
       [1125.63715556,  333.26608889,  564.74217007,  160.51941667,
         311.75939126,  186.93265659],
       [ 667.27333333,  307.00166667,  333.26608889,  165.51989583,
         186.93265659,  160.51941667]])

In [187]:
F0

array([389828.        , 100015.875     , 164786.68      ,  45224.296875  ,
        78354.48408889,  46173.92833333])

In [188]:
AI0 = inv(A0)

In [189]:
c0 = inv(A0).dot(F0)

In [190]:
c0

array([ 38.70300604,  15.6570406 ,  34.40386532,   6.33740604,
        59.52325844, -50.45957988])

In [191]:
y0p = c0[0] + c0[1]*X0[:,0] + c0[2]*X0[:,1] + c0[3]*X0[:,0]**2 + c0[4]*X0[:,1]**2 + c0[5]*X0[:,0]*X0[:,1]

In [192]:
r2_0 = r2_quality(y0p, y0)
print(r2_0)

0.5674707440887274


In [193]:
A1_11,A1_12,A1_13,A1_14,A1_15,A1_16 = len(y1),sum(X1[:,0]),sum(X1[:,1]), sum(X1[:,0]**2), sum(X1[:,1]**2), sum(X1[:,0]*X1[:,1])
A1_21,A1_22,A1_23,A1_24,A1_25,A1_26 =   A1_12, A1_14, A1_16, sum(X1[:,0]**3), sum(X1[:,0]*X1[:,1]**2),  sum(X1[:,1]*X1[:,0]**2)
A1_31,A1_32,A1_33,A1_34,A1_35,A1_36 =   A1_13, A1_23, A1_15, A1_26,  sum(X1[:,1]**3), A1_25
A1_41,A1_42,A1_43,A1_44,A1_45,A1_46 =   A1_14, A1_24, A1_34, sum(X1[:,0]**4),  sum(X1[:,0]**2*X1[:,1]**2), sum(X1[:,0]**3*X1[:,1])
A1_51,A1_52,A1_53,A1_54,A1_55,A1_56 =   A1_15, A1_25, A1_35, A1_45,  sum(X1[:,1]**4), sum(X1[:,1]**3*X1[:,0])
A1_61,A1_62,A1_63,A1_64,A1_65,A1_66 =   A1_16, A1_26, A1_36, A1_46, A1_56, A1_45

F1_1,F1_2,F1_3,F1_4,F1_5,F1_6       = sum(y1),sum(y1*X1[:,0]),sum(y1*X1[:,1]),sum(y1*X1[:,0]**2),sum(y1*X1[:,1]**2),sum(y1*X1[:,0]*X1[:,1])

In [194]:
A1 = np.array([[A1_11,A1_12,A1_13,A1_14,A1_15,A1_16],[A1_21,A1_22,A1_23,A1_24,A1_25,A1_26],
               [A1_31,A1_32,A1_33,A1_34,A1_35,A1_36],[A1_41,A1_42,A1_43,A1_44,A1_45,A1_46],
               [A1_51,A1_52,A1_53,A1_54,A1_55,A1_56],[A1_61,A1_62,A1_63,A1_64,A1_65,A1_66]])
F1 = np.array([F1_1,F1_2,F1_3,F1_4,F1_5,F1_6])

In [195]:
A1

array([[3750.        ,  752.3       , 1557.02777778,  271.11      ,
         725.20601852,  335.43333333],
       [ 752.3       ,  271.11      ,  335.43333333,  117.395     ,
         172.97318673,  126.16083333],
       [1557.02777778,  335.43333333,  725.20601852,  126.16083333,
         375.66464656,  172.97318673],
       [ 271.11      ,  117.395     ,  126.16083333,   58.0575    ,
          68.60635802,   56.47383333],
       [ 725.20601852,  172.97318673,  375.66464656,   68.60635802,
         213.50065506,  100.12411426],
       [ 335.43333333,  126.16083333,  172.97318673,   56.47383333,
         100.12411426,   68.60635802]])

In [196]:
F1

array([253580.        ,  52747.1       , 110506.36111111,  19421.31      ,
        54086.43094136,  24924.88472222])

In [197]:
AI1 = inv(A1)

In [198]:
c1 = inv(A1).dot(F1)

In [199]:
c1

array([ 44.48439561,  24.83240678,  28.08723489,   8.56542714,
        61.75721224, -67.85068982])

In [200]:
y1p = c1[0] + c1[1]*X1[:,0] + c1[2]*X1[:,1] + c1[3]*X1[:,0]**2 + c1[4]*X1[:,1]**2 + c1[5]*X1[:,0]*X1[:,1]

In [201]:
r2_1 = r2_quality(y1p, y1)
print(r2_1)

0.548837774014928


In [202]:
print(y0p.min(),y0p.max())
print(y1p.min(),y1p.max())

47.48948114174469 110.09301622957048
53.4528620045912 111.60142669141268


**тепреь применим найденные коэффициенты к тестовому набору**

In [203]:
df_tst = pd.read_csv(TEST_DATASET_PATH)
df_tst.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history
0,10000,46.0,3.0,1050.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10001,43.0,3.0,1850.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10002,52.0,1.0,1550.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,10003,57.0,6.0,2900.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0
4,10004,44.0,4.0,3150.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0


In [204]:
print(df_tst.shape)

(10000, 11)


In [205]:
# в признаках оставим только релевантные, но из двух коррелирующих признаков qualification и 
# lesson_price выберем вторую, т.к. она более подробная
df_X = pd.DataFrame(df_tst, columns=['years_of_experience','lesson_price','physics'])

# и нормализуем эти признаки
#df_XN = df_X.copy()
df_XN = (df_X-df_X.min(axis=0))/(df_X.max(axis=0)-df_X.min(axis=0))
df_XN.head()

Unnamed: 0,years_of_experience,lesson_price,physics
0,0.3,0.220779,0.0
1,0.3,0.428571,0.0
2,0.1,0.350649,1.0
3,0.6,0.701299,1.0
4,0.4,0.766234,1.0


**заведем вектор ответа, инициализировав его нулями**

In [206]:
yp = pd.DataFrame(df_tst, columns=['Id','physics'])
yp['physics'] = 0
yp.columns = ['Id','mean_exam_points']

In [210]:
i = df_XN['physics']==0
yp.loc[i,'mean_exam_points']= c0[0] + df_XN[i]['years_of_experience']*c0[1] + df_XN[i]['lesson_price']*c0[2] +     df_XN[i]['years_of_experience']**2*c0[3] + df_XN[i]['lesson_price']**2*c0[4] + df_XN[i]['lesson_price']*df_XN[i]['years_of_experience']*c0[5] 

In [211]:
i = df_XN['physics']==1
yp.loc[i,'mean_exam_points']= c1[0] + df_XN[i]['years_of_experience']*c1[1] + df_XN[i]['lesson_price']*c1[2] + df_XN[i]['years_of_experience']**2*c1[3] + df_XN[i]['lesson_price']**2*c1[4] + df_XN[i]['lesson_price']*df_XN[i]['years_of_experience']*c1[5] 

In [212]:
yp.loc[yp['mean_exam_points']>100, 'mean_exam_points'] = 100

In [213]:
yp.max()

Id                  19999.0
mean_exam_points      100.0
dtype: float64

In [214]:
yp.head()

Unnamed: 0,Id,mean_exam_points
0,10000,51.125385
1,10001,63.160182
2,10002,62.116237
3,10003,83.988198
4,10004,92.771954


In [215]:
yp['mean_exam_points'] = np.rint(yp['mean_exam_points'])

In [216]:
yp.head()

Unnamed: 0,Id,mean_exam_points
0,10000,51.0
1,10001,63.0
2,10002,62.0
3,10003,84.0
4,10004,93.0


In [217]:
yp.to_csv(PREP_DATASET_PATH, index=False)