In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('/Users/abdullahalsakib/Downloads/ielts/Ml_part/Lexical_Resource.csv')
X = df['question'] + ' ' + df['answer']  
y = df[['Try_to_vary_your_vocabulary_using_accurate_synonyms', 'Use_less_common_question_specific_words_that_accurately_convey_meaning', 'Check_your_work_for_spelling_and_word_formation_mistakes','score']]



In [3]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svm', SVC())
])

In [4]:
# Define hyperparameters to search
param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['rbf'],
    'svm__gamma': [0.1, 1, 'auto']
}

In [5]:
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train['Try_to_vary_your_vocabulary_using_accurate_synonyms'])  



In [6]:
# Print the best parameters
print("Best parameters found:", grid_search.best_params_)

Best parameters found: {'svm__C': 0.1, 'svm__gamma': 0.1, 'svm__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}


In [7]:
predictions = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test['Try_to_vary_your_vocabulary_using_accurate_synonyms'], predictions))

Classification Report:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00        70

    accuracy                           1.00        70
   macro avg       1.00      1.00      1.00        70
weighted avg       1.00      1.00      1.00        70



In [8]:
import joblib
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'Try_to_vary_your_vocabulary_using_accurate_synonyms.pkl')

['Try_to_vary_your_vocabulary_using_accurate_synonyms.pkl']

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('/Users/abdullahalsakib/Downloads/ielts/Ml_part/Lexical_Resource.csv')
X = df['question'] + ' ' + df['answer']  
y = df['Use_less_common_question_specific_words_that_accurately_convey_meaning']  

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svm', SVC())
])

param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['rbf'],
    'svm__gamma': [0.1, 1, 'auto']
}

grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found:", grid_search.best_params_)

predictions = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predictions))




Best parameters found: {'svm__C': 0.1, 'svm__gamma': 0.1, 'svm__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        66
           1       0.00      0.00      0.00         4

    accuracy                           0.94        70
   macro avg       0.47      0.50      0.49        70
weighted avg       0.89      0.94      0.92        70



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import joblib
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'Use_less_common_question_specific_words_that_accurately_convey_meaning.pkl')

['Use_less_common_question_specific_words_that_accurately_convey_meaning.pkl']

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('/Users/abdullahalsakib/Downloads/ielts/Ml_part/Lexical_Resource.csv')
X = df['question'] + ' ' + df['answer']  
y = df['Check_your_work_for_spelling_and_word_formation_mistakes']  

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svm', SVC())
])

param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['rbf'],
    'svm__gamma': [0.1, 1, 'auto']
}

grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

predictions = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predictions))




Best parameters found: {'svm__C': 10, 'svm__gamma': 0.1, 'svm__kernel': 'rbf', 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2)}
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.48      0.50        29
           1       0.65      0.68      0.67        41

    accuracy                           0.60        70
   macro avg       0.58      0.58      0.58        70
weighted avg       0.60      0.60      0.60        70



In [10]:
import joblib
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'Check_your_work_for_spelling_and_word_formation_mistakes.pkl')

['Check_your_work_for_spelling_and_word_formation_mistakes.pkl']

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib

df = pd.read_csv('/Users/abdullahalsakib/Downloads/ielts/Ml_part/Lexical_Resource.csv')
X = df['question'] + ' ' + df['answer']
y = df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# GridSearchCV for SVR
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 'auto'], 'kernel': ['linear', 'rbf', 'poly']}
grid_search = GridSearchCV(SVR(), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

final_svm_model = SVR(**best_params)
final_svm_model.fit(X_train_tfidf, y_train)

# Cross-validation scores
cross_val_scores = cross_val_score(final_svm_model, X_train_tfidf, y_train, cv=5)
print(f'Cross-Validation Scores: {cross_val_scores}')

# Prediction and evaluation
y_pred = final_svm_model.predict(X_test_tfidf)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Best Parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Cross-Validation Scores: [ 0.07041997  0.04729085  0.10999556 -0.03263236  0.08117421]
Mean Squared Error: 0.09726965847535751


In [18]:
joblib.dump(final_svm_model, 'Lexical_Resource_score.joblib')
print('Model saved successfully.')

Model saved successfully.


In [19]:
joblib.dump(vectorizer, 'Lexical_Resource_score_tfidf_vectorizer.joblib')


['Lexical_Resource_score_tfidf_vectorizer.joblib']