# **Контест №1: Линейная регрессия**

**Выполнил:** Ибрагимов Роман Рифхатович

**Группа:** М8О-303Б-22

In [225]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Работа с датасетом train.csv - обучение модели

Загрузим датасет и посмотрим о нём информацию.

In [226]:
# Read the training split; the notebook expects train.csv in the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first five rows to check how the columns were parsed.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,date_of_registration
0,6622,ecom,6488536.0,,0,82221,0,0,18.450527,0.46175,2020-04-21 16:04:41.817367072
1,2047,information_source,874840.0,21100.0,0,12872,0,0,10.721619,-0.022317,2024-07-19 23:50:07.268931816
2,1118,information_source,571210.0,94707.0,0,0,7420,0,1.922243,0.046396,2024-07-13 16:35:54.794883135
3,4992,news,89534.0,924.0,0,834,0,0,2.149243,-0.09336,2024-09-10 21:29:14.006315095
4,9970,information_source,1043953.0,289288.0,0,58375,20260,3948,3.764965,0.027303,2024-05-26 11:07:15.950527838


In [227]:
# Remove fully identical rows before looking at the data.
data = data.drop_duplicates()

print(f'Размеры датасета: {data.shape}')
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
data.info()

Размеры датасета: (8000, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             8000 non-null   int64  
 1   category               8000 non-null   object 
 2   clicks                 7727 non-null   float64
 3   likes                  7624 non-null   float64
 4   buys                   8000 non-null   int64  
 5   4xx_errors             8000 non-null   int64  
 6   5xx_errors             8000 non-null   int64  
 7   complaints_count       7527 non-null   object 
 8   average_dwelltime      8000 non-null   float64
 9   source_attractiveness  8000 non-null   float64
 10  date_of_registration   8000 non-null   object 
dtypes: float64(4), int64(4), object(3)
memory usage: 687.6+ KB
None


Начнём анализировать датасет. Столбец 'Unnamed: 0' (по сути id каждого сайта) для обучения модели бесполезен, поэтому удалим его.

In [228]:
# 'Unnamed: 0' is removed from the frame so it is not used as a feature.
data = data.drop(labels=['Unnamed: 0'], axis='columns')

Заметим, что столбец 'complaints_count', который по смыслу является числовым признаком, в датасете имеет тип 'object', поэтому изменим его тип; при этом значения, которые нельзя привести к числовому типу, заменим на np.nan.

In [229]:
# Parse the column as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
complaints_raw = data['complaints_count']
data['complaints_count'] = pd.to_numeric(complaints_raw, errors='coerce')

Признак 'date_of_registration' сам по себе полезной информации для обучения не несёт. Преобразуем его в возраст сайта ('age'): из даты регистрации берём год регистрации и вычитаем его из текущего года.

In [230]:
# Replace the registration timestamp with the site's age in whole calendar years.
current_year = pd.to_datetime('today').year
registration_dates = pd.to_datetime(data['date_of_registration'])
year = registration_dates.dt.year
data['age'] = current_year - year
# The raw timestamp is no longer needed once 'age' exists.
data = data.drop(columns=['date_of_registration'])

Сделав данные преобразования, посмотрим на распределение каждого (числового) признака.

In [231]:
# Columns whose distributions are plotted, one histogram per column.
columns = [
    "clicks",
    "likes",
    "buys",
    "4xx_errors",
    "5xx_errors",
    "complaints_count",
    "average_dwelltime",
    "age",
]

for column in columns:
    fig = px.histogram(
        data_frame=data,
        x=column,
        title=f"Распределение {column}",
        template="plotly_dark",
    )
    fig.show()

Можно увидеть, что в признаке 'average_dwelltime' есть отрицательные значения. По смыслу такого быть не может (так как это среднее время, проведённое пользователем на домене). Эти значения заменим на np.nan.

In [232]:
# Keep non-negative dwell times; every other entry (negative or already NaN)
# ends up as NaN, because Series.where fills rows failing the condition with NaN.
data['average_dwelltime'] = data['average_dwelltime'].where(lambda values: values >= 0)

In [233]:
# DataFrame.info() prints the summary itself and returns None, so it is
# called directly; print(data.info()) appended a stray "None" to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               8000 non-null   object 
 1   clicks                 7727 non-null   float64
 2   likes                  7624 non-null   float64
 3   buys                   8000 non-null   int64  
 4   4xx_errors             8000 non-null   int64  
 5   5xx_errors             8000 non-null   int64  
 6   complaints_count       7116 non-null   float64
 7   average_dwelltime      7593 non-null   float64
 8   source_attractiveness  8000 non-null   float64
 9   age                    8000 non-null   int32  
dtypes: float64(5), int32(1), int64(3), object(1)
memory usage: 593.9+ KB
None


В некоторых признаках у нас есть значения np.nan, заменим их на средние по соответствующим столбцам.

In [234]:
# Fill the gaps in each listed column with that column's own mean.
for column in ('clicks', 'likes', 'complaints_count', 'average_dwelltime'):
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

Построим парные графики для всех признаков (включая таргет).

In [235]:
# Pairwise scatter plots of the listed columns, coloured by domain category.
pair_plot_dimensions = [
    "clicks",
    "likes",
    "buys",
    "4xx_errors",
    "5xx_errors",
    "complaints_count",
    "average_dwelltime",
    "source_attractiveness",
]
# Every column is labelled with its own name.
axis_labels = {name: name for name in data.columns}

fig = px.scatter_matrix(
    data,
    dimensions=pair_plot_dimensions,
    color="category",
    title="Парные графики для всех признаков",
    labels=axis_labels,
    template="plotly_dark",
)

fig.update_layout(width=1500, height=1500)
fig.show()

Теперь давайте посмотрим на соотношение всех признаков по каждой категории домена.

In [236]:
# Group once by category and reuse the grouping for every aggregate.
by_category = data.groupby('category')

# Count-like columns are summed per category ...
category_clicks = by_category['clicks'].sum()
category_likes = by_category['likes'].sum()
category_buys = by_category['buys'].sum()
category_4xx_errors = by_category['4xx_errors'].sum()
category_5xx_errors = by_category['5xx_errors'].sum()
category_complaints_count = by_category['complaints_count'].sum()
# ... while dwell time is averaged per category.
category_average_dwelltime = by_category['average_dwelltime'].mean()

category_feature = {
    'clicks': category_clicks,
    'likes': category_likes,
    'buys': category_buys,
    '4xx_errors': category_4xx_errors,
    '5xx_errors': category_5xx_errors,
    'complaints_count': category_complaints_count,
    'average_dwelltime': category_average_dwelltime,
}

# One bar chart per aggregated column.
for metric, values in category_feature.items():
    fig = px.bar(
        x=values.index,
        y=values.values,
        title=f"{metric} по категориям",
        labels={'x': 'Категории', 'y': metric},
        template="plotly_dark",
    )
    fig.show()

'category' - категориальный признак, для него применим one-hot encoding.

In [237]:
# One-hot encode 'category': the column is replaced by one indicator column
# per category value.
data = pd.get_dummies(data=data, columns=['category'])
data.head(n=5)

Unnamed: 0,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,age,category_ecom,category_information_source,category_news,category_porn,category_social
0,6488536.0,598502.743179,0,82221,0,0.0,18.450527,0.46175,4,True,False,False,False,False
1,874840.0,21100.0,0,12872,0,0.0,10.721619,-0.022317,0,False,True,False,False,False
2,571210.0,94707.0,0,0,7420,0.0,1.922243,0.046396,0,False,True,False,False,False
3,89534.0,924.0,0,834,0,0.0,2.149243,-0.09336,0,False,False,True,False,False
4,1043953.0,289288.0,0,58375,20260,3948.0,3.764965,0.027303,0,False,True,False,False,False


Построим матрицу корреляций для признаков и таргета.

In [238]:
# Pairwise correlations between all columns of `data` (features and target).
corr_matrix = data.corr()

fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='RdBu',
        # Pin the colour range to the full correlation range so that the
        # middle of the scale corresponds to zero correlation.
        zmin=-1, zmax=1,
        # The colorbar title is set on the trace: layout.coloraxis only
        # affects traces bound to a shared coloraxis, which this one is not.
        colorbar=dict(title="Correlation"),
    ))

fig.update_layout(
    title='Correlation Matrix (Including Target)',
    xaxis_nticks=36,
    width=600,
    height=600,
    margin=dict(l=100, r=100, t=100, b=100),
)

fig.show()

Из вышеприведённых графиков можно понять, что наибольшее влияние на привлекательность домена (наш таргет) оказывают 'clicks', 'likes', 'buys'. Остальные признаки имеют меньшее влияние (и даже отрицательное).

Теперь настал момент придумывания новых фич. Для начала вспомним, что данные в основном распределены экспоненциально, поэтому прологарифмируем 'clicks', 'likes', 'buys' (как наиболее значимые признаки). Далее интересно посмотреть на отношения лайков к кликам и покупок к кликам.

In [239]:
# Work on a copy: a plain assignment would only alias `data`, so the log
# transforms below would overwrite the raw columns and would be applied a
# second time if this cell were executed again.
data_new_feature = data.copy()

# Compress the heavy right tails of the count features with log(1 + x).
for column in ['clicks', 'likes', 'buys']:
    data_new_feature[column] = np.log1p(data_new_feature[column])

# Engagement ratios, computed on the already log-transformed columns.
# NOTE(review): a row with clicks == 0 gives a zero denominator here - confirm
# that such rows cannot occur in the data.
data_new_feature['likes_clicks_ratio'] = data_new_feature['likes'] / data_new_feature['clicks']
data_new_feature['buys_clicks_ratio'] = data_new_feature['buys'] / data_new_feature['clicks']
data_new_feature

Unnamed: 0,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,age,category_ecom,category_information_source,category_news,category_porn,category_social,likes_clicks_ratio,buys_clicks_ratio
0,15.685548,13.302188,0.000000,82221,0,0.000000,18.450527,0.461750,4,True,False,False,False,False,0.848054,0.000000
1,13.681797,9.957076,0.000000,12872,0,0.000000,10.721619,-0.022317,0,False,True,False,False,False,0.727761,0.000000
2,13.255514,11.458554,0.000000,0,7420,0.000000,1.922243,0.046396,0,False,True,False,False,False,0.864437,0.000000
3,11.402385,6.829794,0.000000,834,0,0.000000,2.149243,-0.093360,0,False,False,True,False,False,0.598979,0.000000
4,13.858526,12.575181,0.000000,58375,20260,3948.000000,3.764965,0.027303,0,False,True,False,False,False,0.907397,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,14.199821,13.070070,0.000000,61307,83928,16841.000000,4.191481,0.017470,1,False,True,False,False,False,0.920439,0.000000
7996,11.442213,8.767018,0.000000,3418,705,0.000000,3.745192,-0.019069,0,False,True,False,False,False,0.766199,0.000000
7997,11.325595,0.000000,0.000000,4302,1718,1007.000000,5.837475,-0.061523,0,False,True,False,False,False,0.000000,0.000000
7998,13.621801,11.568048,0.000000,0,2892,267.000000,11.994037,0.131620,1,False,False,False,False,True,0.849230,0.000000


Приступаем к обучению модели. Выделим таргет из нашего датасета.

In [240]:
# Target vector: the column the model has to predict.
y = data_new_feature['source_attractiveness']
# Feature matrix: every column except the target, in the original order.
X = data_new_feature.loc[:, data_new_feature.columns != 'source_attractiveness']

Непосредственно обучение модели, и оценка качества предсказаний, тут всё просто и очевидно.

In [241]:
# Hold out 20% of the rows for validation. The seed is fixed so that the
# split, the reported metrics and the fitted model are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f'Train set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')

# Ordinary least-squares fit on the training part only.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'R2: {r2}')

Train set size: 6400
Test set size: 1600
MSE: 0.014774157344347793
R2: 0.7140365218419331


## Работа с датасетом test.csv

Загрузим датасет test.csv и проделаем такие же преобразования, что и с датасетом train.csv.

In [242]:
# Read the competition test split; it is expected next to the notebook.
test = pd.read_csv(filepath_or_buffer='test.csv')
# Display the frame as the cell output.
test

Unnamed: 0,ID,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,date_of_registration
0,0,ecom,73749.0,20167.0,24000,2509,166,800,3.408171,2021-09-08 06:28:32.157046464
1,1,information_source,1814015.0,321069.0,0,0,0,739,7.613713,2024-07-04 23:36:39.276336238
2,2,ecom,11352870.0,5377369.0,7807118,202610,71386,16214,7.747694,2024-02-26 11:39:21.926781108
3,3,ecom,1811896.0,0.0,1340025,11674,0,0,2.882191,2023-11-03 23:52:15.085026428
4,4,ecom,2068968.0,999279.0,187409,45723,7666,3520,8.618482,2023-07-11 11:55:16.739515152
...,...,...,...,...,...,...,...,...,...,...
1995,1995,information_source,,549548.0,0,56779,0,117,1.490243,2020-08-06 17:36:54.802416800
1996,1996,ecom,2718982.0,1141419.0,1056869,81709,0,1427,3.413341,2023-09-24 16:31:06.112630028
1997,1997,information_source,4220069.0,372383.0,0,173104,128346,27947,5.288553,2023-10-22 05:48:58.943180372
1998,1998,ecom,2697468.0,1161756.0,0,41933,14650,4401,12.515275,2023-10-19 13:07:05.077775348


In [243]:
# Print column dtypes and non-null counts of the test frame; info() writes
# the report to stdout itself.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    2000 non-null   int64  
 1   category              2000 non-null   object 
 2   clicks                1930 non-null   float64
 3   likes                 1913 non-null   float64
 4   buys                  2000 non-null   int64  
 5   4xx_errors            2000 non-null   int64  
 6   5xx_errors            2000 non-null   int64  
 7   complaints_count      1883 non-null   object 
 8   average_dwelltime     2000 non-null   float64
 9   date_of_registration  2000 non-null   object 
dtypes: float64(3), int64(4), object(3)
memory usage: 156.4+ KB


In [244]:
# Repeat the train.csv preprocessing steps on the test frame.

# Non-numeric complaint counts become NaN; negative dwell times are invalid.
test['complaints_count'] = pd.to_numeric(test['complaints_count'], errors='coerce')
test['average_dwelltime'] = test['average_dwelltime'].apply(lambda x: x if x >= 0 else np.nan)

# Replace the registration timestamp with the site's age in calendar years.
year = pd.to_datetime(test['date_of_registration']).dt.year
current_year = pd.to_datetime('today').year
test['age'] = current_year - year
test = test.drop(columns=['date_of_registration'])

# NOTE(review): gaps are filled with the means of this file, not with the
# means used for the training data - confirm this is intended.
for column in ['clicks', 'likes', 'complaints_count', 'average_dwelltime']:
    test[column] = test[column].fillna(test[column].mean())

# 'ID' is dropped so it is not passed to the model as a feature.
test = test.drop(columns=['ID'])

test = pd.get_dummies(test, columns=['category'])

# Same log(1 + x) transform and log-scale ratios as for the training features.
for column in ['clicks', 'likes', 'buys']:
    test[column] = np.log1p(test[column])

test['likes_clicks_ratio'] = test['likes'] / test['clicks']
test['buys_clicks_ratio'] = test['buys'] / test['clicks']

# get_dummies only creates indicator columns for categories present in this
# file, so the result may differ from the training matrix in column set or
# order. Align to the training feature columns: indicators missing here are
# added as all-False, columns unknown to the model are dropped.
test = test.reindex(columns=X.columns, fill_value=False)

In [245]:
# Display the preprocessed test frame as the cell output.
test

Unnamed: 0,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,age,category_ecom,category_information_source,category_news,category_porn,category_social,likes_clicks_ratio,buys_clicks_ratio
0,11.208436,9.911852,10.085851,2509,166,800.0,3.408171,3,True,False,False,False,False,0.884321,0.899845
1,14.411054,12.679414,0.000000,0,0,739.0,7.613713,0,False,True,False,False,False,0.879840,0.000000
2,16.244981,15.497710,15.870547,202610,71386,16214.0,7.747694,0,True,False,False,False,False,0.954000,0.976951
3,14.409885,0.000000,14.108200,11674,0,0.0,2.882191,1,True,False,False,False,False,0.000000,0.979064
4,14.542561,13.814790,12.141054,45723,7666,3520.0,8.618482,1,True,False,False,False,False,0.949956,0.834864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,14.797369,13.216853,0.000000,56779,0,117.0,1.490243,4,False,True,False,False,False,0.893189,0.000000
1996,14.815768,13.947784,13.870822,81709,0,1427.0,3.413341,1,True,False,False,False,False,0.941415,0.936220
1997,15.255362,12.827681,0.000000,173104,128346,27947.0,5.288553,1,False,True,False,False,False,0.840864,0.000000
1998,14.807824,13.965444,0.000000,41933,14650,4401.0,12.515275,1,True,False,False,False,False,0.943112,0.000000


Получаем предсказания для сабмита в каггл.

In [246]:
# Predict the target for every row of the preprocessed test frame with the
# model fitted above.
submission_predict = model.predict(test)

Формируем сабмит для каггла и всё, конец.

In [247]:
# Build the submission table directly from the predictions. The name `data`
# is deliberately not reused here, so the training DataFrame bound to it is
# not replaced by a dict.
submit = pd.DataFrame({"source_attractiveness": submission_predict})
# The default 0..n-1 row index is written out as the "ID" column.
submit.to_csv('submission.csv', index_label="ID")