In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR  # SVR for regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import joblib

# Load the dataset
df = pd.read_csv('Coherence_and_Cohesion.csv')

# Build the model input by joining each question with its answer.
# Missing cells are replaced with empty strings first: otherwise
# `NaN + ' ' + text` evaluates to NaN and the text vectorizer has no
# document to work with for that row.
X = df['question'].fillna('').astype(str) + ' ' + df['answer'].fillna('').astype(str)

# Convert 'Include_an_introduction_and_conclusion' from 'No'/'Yes' to 0/1.
# Series.map turns every value that is not a key of the mapping into NaN,
# so the raw labels are kept around to report anything that did not map.
LABEL_COLUMN = 'Include_an_introduction_and_conclusion'
raw_labels = df[LABEL_COLUMN]
df[LABEL_COLUMN] = raw_labels.map({'No': 0, 'Yes': 1})

# Fail early with a readable message instead of letting NaN targets reach
# the estimator's fit call.
unmapped_labels = raw_labels[df[LABEL_COLUMN].isna()]
if not unmapped_labels.empty:
    raise ValueError(
        f"Column {LABEL_COLUMN!r} contains values other than 'Yes'/'No': "
        f"{unmapped_labels.unique().tolist()}"
    )

# Target variable: 1 when the label was 'Yes', 0 when it was 'No'
y = df[LABEL_COLUMN]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regression pipeline: TF-IDF features from the merged text, then an SVR.
# The max_features value given here is only a starting point; the grid
# below supplies its own candidates for that setting.
svm_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('svm', SVR()),
])

# Candidate hyperparameters. Keys follow the `<step name>__<parameter>`
# convention so each entry is routed to the matching pipeline step.
param_grid = dict(
    tfidf__max_features=[5000, 10000, None],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    svm__C=[0.1, 1, 10],
    svm__kernel=['rbf'],
    svm__gamma=[0.1, 1, 'auto'],
)

# Exhaustive 5-fold cross-validated search over the grid, using all
# available cores. Only the training split is used for fitting.
grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best parameters found:", grid_search.best_params_)

# Keep a handle on the best pipeline found by the search
best_model = grid_search.best_estimator_

# Predict on the held-out test split through the fitted search object
predictions = grid_search.predict(X_test)

# Report the mean squared error between the 0/1 targets and the
# continuous SVR outputs
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error:", mse)

# Persist the best pipeline (vectorizer + SVR) to disk
joblib.dump(best_model, filename='svm_model_Introduction_and_Conclusion_regression.pkl')

# Load the model
loaded_model = joblib.load('svm_model_Introduction_and_Conclusion_regression.pkl')

# Sample question/essay pair. These texts were previously passed to input()
# as the *prompt*, which only printed them and then waited for the user to
# type something else; they are kept here as defaults instead.
sample_question = "Levels of youth crime are increasing rapidly in most cities around the world. What are the reasons for this, and suggest some solutions."
sample_answer = "Nowadays crime rate in young generation is escalating worldwide in most of the cities. This could be due to unemployment and parents’ negligence towards their children and this would be best solved by taking some efforts such as government should provide more employment opportunities for graduates and parents should spend more time with their children.The first primitive reason behind crime of adolescents is scarcity of jobs. Even though youngsters have degrees, certificates of universities but due to unavailability of jobs they did not get jobs which further forces them to choose wrong path. For example, if youth did not get job in their required field, they will try other ways to earn money like gambling, murders. Secondly, parents are busy in their hectic schedule that the do not have time to spend with their children. Children feel neglected due to the fact that they started doing things which are harmful for their future. However, without counselling and guidance they opt wrong path which would be devastated for their future.To combat the problem of crime, government have to take steps to eliminate the issue of unemployment. In other words, there should be some part time jobs available for youngsters so that their mind does not get distracted after completion of their studies. While parents or guardians of children should spend some time in a week so that children will not feel alone and share their feelings and concerns with them. Thus, it increases bonding between parents and children, and they will think twice before doing any harmful activity,To conclude, it is a joint effort of government and parents to make the next generation in good standing so that they will increase the economy of world instead of making it a strain on the world."

# Take individual inputs; an empty reply falls back to the sample text
input_question = input("Enter the question (leave blank to use the sample): ").strip() or sample_question
input_answer = input("Enter the answer (leave blank to use the sample): ").strip() or sample_answer

# Merge the input strings the same way the training features were built
# (question, a single space, then the answer)
merged_input = input_question + ' ' + input_answer

# Make predictions using the loaded regression model; the pipeline expects
# an iterable of documents, hence the one-element list
prediction = loaded_model.predict([merged_input])

print("Predicted output (score):", prediction)


Best parameters found: {'svm__C': 1, 'svm__gamma': 1, 'svm__kernel': 'rbf', 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2)}
Mean Squared Error: 0.24061093764278943
Predicted output (score): [0.51029673]
