In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('/Users/abdullahalsakib/Downloads/ielts/Ml_part/Grammatical_Range.csv')
X = df['question'] + ' ' + df['answer']  
y = df['Use_a_variety_of_complex_and_simple_sentences']  

# Use LabelEncoder for multi-class classification
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svm', SVC())
])

param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['rbf'],
    'svm__gamma': [0.1, 1, 'auto']
}

grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

predictions = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predictions))


Best parameters found: {'svm__C': 0.1, 'svm__gamma': 0.1, 'svm__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        70

    accuracy                           1.00        70
   macro avg       1.00      1.00      1.00        70
weighted avg       1.00      1.00      1.00        70



In [15]:
import joblib
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'Use_a_variety_of_complex_and_simple_sentences.pkl')

['Use_a_variety_of_complex_and_simple_sentences.pkl']

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('/Users/abdullahalsakib/Downloads/ielts/Ml_part/Grammatical_Range.csv')
X = df['question'] + ' ' + df['answer']  
y = df['Check_your_writing_for_errors']  

# Use LabelEncoder for multi-class classification
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svm', SVC())
])

param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['rbf'],
    'svm__gamma': [0.1, 1, 'auto']
}

grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

predictions = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predictions))


Best parameters found: {'svm__C': 10, 'svm__gamma': 0.1, 'svm__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.47      0.55        17
           1       0.84      0.92      0.88        53

    accuracy                           0.81        70
   macro avg       0.76      0.70      0.72        70
weighted avg       0.80      0.81      0.80        70



In [17]:
import joblib
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'Check_your_writing_for_errors.pkl')

['Check_your_writing_for_errors.pkl']

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib

df = pd.read_csv('/Users/abdullahalsakib/Downloads/ielts/Ml_part/Grammatical_Range.csv')
X = df['question'] + ' ' + df['answer']
y = df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# GridSearchCV for SVR
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 'auto'], 'kernel': ['linear', 'rbf', 'poly']}
grid_search = GridSearchCV(SVR(), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

final_svm_model = SVR(**best_params)
final_svm_model.fit(X_train_tfidf, y_train)

# Cross-validation scores
cross_val_scores = cross_val_score(final_svm_model, X_train_tfidf, y_train, cv=5)
print(f'Cross-Validation Scores: {cross_val_scores}')

# Prediction and evaluation
y_pred = final_svm_model.predict(X_test_tfidf)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Cross-Validation Scores: [ 0.09694739  0.16128137  0.07061384  0.06434243 -0.13185831]
Mean Squared Error: 0.08827159156663184


In [19]:
joblib.dump(final_svm_model, 'Grammatical_Range_score.joblib')
print('Model saved successfully.')

Model saved successfully.


In [20]:
joblib.dump(vectorizer, 'Grammatical_Range_score_tfidf_vectorizer.joblib')


['Grammatical_Range_score_tfidf_vectorizer.joblib']