**Подключение библиотек и скриптов**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from numpy.linalg import inv

In [2]:
def r2_quality(y_prd, y_tru):
    y_mn = np.average(y_tru,axis=0)
    return 1.0 - np.sum((y_prd - y_tru)**2)/np.sum((y_tru - y_mn)**2)

**Пути к директориям и файлам**

In [3]:
# input
TRAIN_DATASET_PATH = 'train.csv'
TEST_DATASET_PATH  = 'test.csv'

# output
PREP_DATASET_PATH  = 'GB_predictions.csv'

**Загрузка  данных<a class="anchor" id="load_data"></a>**

In [4]:
df = pd.read_csv(TRAIN_DATASET_PATH)
df.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,0,40.0,0.0,1400.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,63.0
1,1,48.0,4.0,2850.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,86.0
2,2,39.0,0.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0
3,3,46.0,5.0,1400.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
4,4,43.0,1.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0


**Подготовка данных**

In [5]:
# разделим набор по значению столбца "физика"
df0 = df[df['physics']==0]
df1 = df[df['physics']==1]
print(df0.shape, df1.shape)

(6250, 12) (3750, 12)


In [6]:
# отделим таргеты
y0 = df0['mean_exam_points']
y1 = df1['mean_exam_points']

In [7]:
# в признаках оставим только релевантные, но из двух коррелирующих признаков qualification и 
# lesson_price теперь выберем оба, может что улучшится
df_X0 = pd.DataFrame(df0, columns=['years_of_experience','lesson_price','qualification'])
df_X1 = pd.DataFrame(df1, columns=['years_of_experience','lesson_price','qualification'])

# и нормализуем эти признаки
X0 = np.array((df_X0-df_X0.min(axis=0))/(df_X0.max(axis=0)-df_X0.min(axis=0))) 
X1 = np.array((df_X1-df_X1.min(axis=0))/(df_X1.max(axis=0)-df_X1.min(axis=0))) 

**Поставим задачу линейной регрессии в виде: A*c=f, где f аппроксимирует наш таргет,
а результатом станет нахождение вектора коэффициентов c (для двух наборов это будут разные 
коэффициенты: c0 и c1)**

In [8]:
A0_11,A0_12,A0_13,A0_14 = len(y0), sum(X0[:,0]),        sum(X0[:,1]),        sum(X0[:,2])
A0_21,A0_22,A0_23,A0_24 = A0_12,   sum(X0[:,0]*X0[:,0]),sum(X0[:,1]*X0[:,0]),sum(X0[:,2]*X0[:,0])
A0_31,A0_32,A0_33,A0_34 = A0_13,   A0_23,               sum(X0[:,1]*X0[:,1]),sum(X0[:,2]*X0[:,1])
A0_41,A0_42,A0_43,A0_44 = A0_14,   A0_24,               A0_34,               sum(X0[:,2]*X0[:,2])

F0_1,F0_2,F0_3,F0_4  = sum(y0),sum(y0*X0[:,0]),sum(y0*X0[:,1]),sum(y0*X0[:,2])

In [9]:
A0 = np.array([[A0_11,A0_12,A0_13,A0_14],[A0_21,A0_22,A0_23,A0_24],[A0_31,A0_32,A0_33,A0_34],[A0_41,A0_42,A0_43,A0_44]])
F0 = np.array([F0_1,F0_2,F0_3,F0_4])

In [10]:
A0

array([[6250.        , 1543.125     , 2502.86666667, 1489.33333333],
       [1543.125     ,  683.859375  ,  667.27333333,  438.45833333],
       [2502.86666667,  667.27333333, 1125.63715556,  781.62222222],
       [1489.33333333,  438.45833333,  781.62222222,  792.88888889]])

In [11]:
F0

array([389828.        , 100015.875     , 164786.68      , 109853.66666667])

In [12]:
AI0 = inv(A0)

In [13]:
c0 = inv(A0).dot(F0)

In [14]:
c0

array([43.02192282,  1.34072205, 32.84578022, 24.61737166])

In [15]:
y0p = c0[0] + c0[1]*X0[:,0] + c0[2]*X0[:,1] + c0[3]*X0[:,2]

In [16]:
r2_0 = r2_quality(y0p, y0)
print(r2_0)
# 0.5510971479147218 - вариант 1
# 0.6385190377458905 - вариант 3

0.6385190377458905


In [17]:
A1_11,A1_12,A1_13,A1_14 = len(y1), sum(X1[:,0]),        sum(X1[:,1]),        sum(X1[:,2])
A1_21,A1_22,A1_23,A1_24 = A1_12,   sum(X1[:,0]*X1[:,0]),sum(X1[:,1]*X1[:,0]),sum(X1[:,2]*X1[:,0])
A1_31,A1_32,A1_33,A1_34 = A1_13,   A1_23,               sum(X1[:,1]*X1[:,1]),sum(X1[:,2]*X1[:,1])
A1_41,A1_42,A1_43,A1_44 = A1_14,   A1_24,               A1_34,               sum(X1[:,2]*X1[:,2])

F1_1,F1_2,F1_3,F1_4  = sum(y1),sum(y1*X1[:,0]),sum(y1*X1[:,1]),sum(y1*X1[:,2])

In [18]:
A1 = np.array([[A1_11,A1_12,A1_13,A1_14],[A1_21,A1_22,A1_23,A1_24],[A1_31,A1_32,A1_33,A1_34],[A1_41,A1_42,A1_43,A1_44]])
F1 = np.array([F1_1,F1_2,F1_3,F1_4])

In [19]:
A1

array([[3750.        ,  752.3       , 1557.02777778,  909.        ],
       [ 752.3       ,  271.11      ,  335.43333333,  216.56666667],
       [1557.02777778,  335.43333333,  725.20601852,  488.70833333],
       [ 909.        ,  216.56666667,  488.70833333,  479.66666667]])

In [20]:
F1

array([253580.        ,  52747.1       , 110506.36111111,  71479.33333333])

In [21]:
AI1 = inv(A1)

In [22]:
c1 = inv(A1).dot(F1)

In [23]:
c1

array([48.77158725,  2.68972128, 29.04384767, 25.78767093])

In [24]:
y1p = c1[0] + c1[1]*X1[:,0] + c1[2]*X1[:,1] + c1[3]*X1[:,2]

In [25]:
r2_1 = r2_quality(y1p, y1)
print(r2_1)

0.6288477829555816


In [26]:
print(y0p.min(),y0p.max())
print(y1p.min(),y1p.max())

44.02746436031029 93.28482568263811
50.116447893653 104.81355185418067


**тепреь применим найденные коэффициенты к тестовому набору**

In [27]:
df_tst = pd.read_csv(TEST_DATASET_PATH)
df_tst.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history
0,10000,46.0,3.0,1050.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10001,43.0,3.0,1850.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10002,52.0,1.0,1550.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,10003,57.0,6.0,2900.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0
4,10004,44.0,4.0,3150.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0


In [28]:
print(df_tst.shape)

(10000, 11)


In [30]:
# в признаках оставим только релевантные, но из двух коррелирующих признаков qualification и 
# lesson_price выберем вторую, т.к. она более подробная
df_X = pd.DataFrame(df_tst, columns=['years_of_experience','lesson_price','qualification','physics'])

# и нормализуем эти признаки
#df_XN = df_X.copy()
df_XN = (df_X-df_X.min(axis=0))/(df_X.max(axis=0)-df_X.min(axis=0))
df_XN.head()

Unnamed: 0,years_of_experience,lesson_price,qualification,physics
0,0.3,0.220779,0.0,0.0
1,0.3,0.428571,0.333333,0.0
2,0.1,0.350649,0.0,1.0
3,0.6,0.701299,0.666667,1.0
4,0.4,0.766234,0.666667,1.0


**заведем вектор ответа, инициализировав его нулями**

In [31]:
yp = pd.DataFrame(df_tst, columns=['Id','physics'])
yp['physics'] = 0
yp.columns = ['Id','mean_exam_points']

In [32]:
i = df_XN['physics']==0
yp.loc[i,'mean_exam_points']= c0[0] + df_XN[i]['years_of_experience']*c0[1] + df_XN[i]['lesson_price']*c0[2]  + df_XN[i]['qualification']*c0[3]

In [33]:
i = df_XN['physics']==1
yp.loc[i,'mean_exam_points'] = c1[0] + df_XN[i]['years_of_experience']*c1[1] + df_XN[i]['lesson_price']*c1[2]  + df_XN[i]['qualification']*c1[3]

In [34]:
yp.loc[yp['mean_exam_points']>100, 'mean_exam_points'] = 100

In [35]:
yp.max()

Id                  19999.000000
mean_exam_points       99.290074
dtype: float64

In [36]:
yp.head()

Unnamed: 0,Id,mean_exam_points
0,10000,50.675805
1,10001,65.706693
2,10002,59.224766
3,10003,87.945613
4,10004,89.293633


In [37]:
yp['mean_exam_points'] = np.rint(yp['mean_exam_points'])

In [38]:
yp.head()

Unnamed: 0,Id,mean_exam_points
0,10000,51.0
1,10001,66.0
2,10002,59.0
3,10003,88.0
4,10004,89.0


In [39]:
yp.to_csv(PREP_DATASET_PATH, index=False)

In [None]:
sc= {-2.., 0.55474 ,0.56892} - > 0.65167