In [50]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

df = pd.read_csv("Resume_scores_with_category.csv")  

df['clean_text'] = df['Cleaned_Resume'].fillna('')

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['clean_text'])

y = df['overall_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

joblib.dump(model, 'random_forest_resume_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("Модель и векторизатор сохранены.")


RMSE: 0.1424
R2 Score: 0.3912
Модель и векторизатор сохранены.
