In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset and normalise the column names to lower snake_case.
df = pd.read_csv('jamb_exam_results.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Drop the student_id column and replace every missing value with 0.
df = df.drop(columns='student_id').fillna(0)

# Train/validation/test split: hold out 20% for test first, then 25% of the
# remaining 80% for validation (i.e. 60/20/20 overall).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Extract the target from each split. pop() removes the column in place, so the
# frames handed to the vectorizer below no longer contain jamb_score.
y_train = df_train.pop('jamb_score').values
y_val = df_val.pop('jamb_score').values
y_test = df_test.pop('jamb_score').values

# Turn the row dictionaries into feature matrices. The vectorizer is fitted on
# the training records only and then reused for the validation and test sets.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val = dv.transform(df_val.to_dict(orient='records'))
X_test = dv.transform(df_test.to_dict(orient='records'))

In [2]:
# Fit a decision tree limited to depth 1 (at most a single split) to find out
# which feature is used at the root.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# Map the root node's feature index back to the vectorizer's feature name.
feature_names = dv.get_feature_names_out()
root_feature_idx = dt.tree_.feature[0]

# scikit-learn stores a negative sentinel (-2) in tree_.feature for nodes that
# do not split. Indexing feature_names with it would silently return an
# unrelated feature via negative indexing, so fail loudly instead.
if root_feature_idx < 0:
    raise ValueError("Дерево не выполнило разбиение: корневой узел является листом")

tree_feature = feature_names[root_feature_idx]
print(f"Признак для разбиения: {tree_feature}")

Признак для разбиения: study_hours_per_week


In [3]:
# Baseline random forest: 10 trees, fixed seed, all available CPU cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Validation RMSE = square root of the mean squared error on the validation set.
y_pred = rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.2f}")

RMSE: 42.14


In [4]:
# Sweep n_estimators from 10 to 200 in steps of 10 and record the validation
# RMSE for each forest size.
#
# warm_start=True (the same approach the max_depth experiment in this notebook
# uses) keeps the trees that were already grown and only adds the missing ones
# on each fit, so the sweep grows 200 trees in total instead of refitting every
# forest from scratch.
# NOTE(review): with a fixed integer random_state this is expected to yield the
# same ensembles as independent fits - confirm against the recorded output.
MIN_IMPROVEMENT = 0.001  # smallest RMSE decrease that still counts as an improvement

scores = []
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1, warm_start=True)

for n in range(10, 201, 10):
    rf.n_estimators = n
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

    print(f"n_estimators={n}, RMSE={rmse:.3f}")

# Find the first forest size after which the next step no longer lowers RMSE by
# at least MIN_IMPROVEMENT.
for (prev_n, prev_rmse), (_, curr_rmse) in zip(scores, scores[1:]):
    improvement = prev_rmse - curr_rmse
    if improvement < MIN_IMPROVEMENT:
        print(f"RMSE перестает улучшаться после {prev_n} estimators")
        break
else:
    # Every step improved by at least the threshold: report that explicitly
    # instead of printing nothing.
    print(f"RMSE улучшается на всем диапазоне до {scores[-1][0]} estimators")

n_estimators=10, RMSE=42.137
n_estimators=20, RMSE=41.461
n_estimators=30, RMSE=41.106
n_estimators=40, RMSE=40.917
n_estimators=50, RMSE=40.852
n_estimators=60, RMSE=40.784
n_estimators=70, RMSE=40.677
n_estimators=80, RMSE=40.539
n_estimators=90, RMSE=40.504
n_estimators=100, RMSE=40.517
n_estimators=110, RMSE=40.593
n_estimators=120, RMSE=40.625
n_estimators=130, RMSE=40.651
n_estimators=140, RMSE=40.595
n_estimators=150, RMSE=40.597
n_estimators=160, RMSE=40.604
n_estimators=170, RMSE=40.628
n_estimators=180, RMSE=40.641
n_estimators=190, RMSE=40.631
n_estimators=200, RMSE=40.601
RMSE перестает улучшаться после 90 estimators


In [6]:
# Tune max_depth: for each candidate depth, average the validation RMSE over
# forest sizes 10, 20, ..., 200 and keep the depth with the lowest mean.
mean_rmse_by_depth = {}

for depth in [10, 15, 20, 25]:
    # warm_start=True makes each fit() below add trees to the existing
    # ensemble rather than rebuilding it.
    rf = RandomForestRegressor(
        n_estimators=10,
        max_depth=depth,
        random_state=1,
        n_jobs=-1,
        warm_start=True,
    )

    rmse_scores = []
    for n in range(10, 201, 10):
        rf.set_params(n_estimators=n)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)

    mean_rmse = np.mean(rmse_scores)
    mean_rmse_by_depth[depth] = mean_rmse
    print(f"max_depth={depth}, средний RMSE={mean_rmse:.3f}")

# min() returns the first depth with the lowest mean RMSE, so ties resolve to
# the earliest candidate.
best_depth = min(mean_rmse_by_depth, key=mean_rmse_by_depth.get)
best_rmse = mean_rmse_by_depth[best_depth]

print(f"Лучший max_depth: {best_depth}")

max_depth=10, средний RMSE=40.392
max_depth=15, средний RMSE=40.735
max_depth=20, средний RMSE=40.740
max_depth=25, средний RMSE=40.788
Лучший max_depth: 10


In [8]:
# Feature importance: train a 10-tree forest with max_depth=20 and rank the
# vectorized features by the importance scores the fitted model exposes.
rf_final = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_final.fit(X_train, y_train)

# Pair every feature name with its importance, most important first.
importance_table = pd.DataFrame({
    'feature': dv.get_feature_names_out(),
    'importance': rf_final.feature_importances_,
})
feature_importances = importance_table.sort_values(by='importance', ascending=False)

# Show the ten most important features.
print(feature_importances.head(10))



                             feature  importance
27              study_hours_per_week    0.248354
4                    attendance_rate    0.149729
5                 distance_to_school    0.136486
28                   teacher_quality    0.082682
2                                age    0.069311
3              assignments_completed    0.031517
24         socioeconomic_status=High    0.025714
17           parent_involvement=High    0.022919
10                 it_knowledge=High    0.017719
15  parent_education_level=Secondary    0.016957


Вопрос 1: Признак для разбиения: study_hours_per_week

Вопрос 2: RMSE: 42.13 (рассчитанное значение: 42.14)

Вопрос 3: 80 (рассчитанное значение: 90)

Вопрос 4: Лучший max_depth - 10

Вопрос 5: study_hours_per_week