Read in the data

In [2]:
import pandas as pd
from google.colab import drive

drive.mount('/content/gdrive')
df = pd.read_csv("gdrive/My Drive/Dissertation Complete/RobertA_base_results.csv")

df.tail()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,text,university,RobertA_score
444617,and why did Cornell have to put up nets under ...,Cornell,0.513806
444618,Because Cornell has lots of bridges.,Cornell,0.512402
444619,When a lifelong friend of my brother was an ...,MIT,0.508957
444620,Heard from a Yalie who said Princeton's partie...,Princeton,0.514205
444621,Absolutely most top schools have their fair sh...,Columbia,0.513273


Create a target for the SVM based on how positive the sentiment score is

In [3]:
df['Rank'] = df['RobertA_score'].rank(ascending=False)
df.head()

Unnamed: 0,text,university,RobertA_score,Rank
0,"Hey, Thanks so much for the response! I actual...",UNC,0.549539,68012.0
1,There are a lot of great choices. I'll throw o...,Wake Forest,0.549876,66834.0
2,"Okay thanks, I'd never even heard of Wake Fore...",Wake Forest,0.55248,61867.0
3,"""is the stereotype that USC is party school fo...",USC,0.552015,62418.0
4,"Hi ! First of all, I think your choice should ...",USC,0.552359,62008.0


split the dataset, using the model score as an input and the degree of sentiment as a target

In [4]:
from sklearn.model_selection import train_test_split

X = df['RobertA_score'].values.reshape(-1, 1)
y = df['Rank'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Create the Linear SVM model and fit it to the data

In [5]:
from sklearn.svm import SVR

# Create the SVM regressor
svm_regressor = SVR(kernel='linear')  # You can choose different kernels (e.g., 'linear', 'poly', 'rbf', etc.)

# Fit the model to the training data
svm_regressor.fit(X_train, y_train)

predicted_ranks = svm_regressor.predict(X_test)

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, predicted_ranks)
mse = mean_squared_error(y_test, predicted_ranks)
r2 = r2_score(y_test, predicted_ranks)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R-squared: {r2}')

MAE: 111185.00465241612
MSE: 16482863680.005522
R-squared: 0.0012320587879010025


Predict the ranking of the sentiment scores

In [7]:
df.drop(columns=["Rank"], axis=1, inplace=True)

# Predict ranks based on the sentiment scores
predicted_ranks = svm_regressor.predict(df['RobertA_score'].values.reshape(-1, 1))

# Append the predicted ranks to the DataFrame
df['Predicted_rank'] = predicted_ranks

# Sort the DataFrame by the predicted ranks in ascending order (lower ranks are better)
df = df.sort_values(by='Predicted_rank', ascending=True)

# Reset the index to have a clean ranking
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,text,university,RobertA_score,Predicted_rank
0,Ya honestly it depends on the interviewer. Peo...,Duke,0.582699,222076.98033
1,How to get into Harvard: do what you are passi...,Harvard,0.582248,222079.104335
2,: I am sorry that you feel this way--undergr...,Georgetown,0.582227,222079.200971
3,You clearly seem like you would be a much bett...,Stanford,0.582041,222080.079114
4,If they find out you applied elsewhere during ...,Princeton,0.581978,222080.374357


Write the results to a new file

In [8]:
from google.colab import drive

drive.mount('/content/gdrive')
path = '/content/gdrive/My Drive/Dissertation Complete/RobertA_SVM_ranking_results.csv'

with open(path, 'w') as f:
  df.to_csv(f, encoding='utf-8', index=False, header=True)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
df.tail()

Unnamed: 0,text,university,RobertA_score,Predicted_rank
444617,"Well, can't find it. But College P r ow ler h...",Notre Dame,0.503288,222451.242422
444618,"at each school, which gives you the best retu...",MIT,0.502849,222453.309683
444619,I just hope when I take that nap dreams about...,Penn,0.502463,222455.128052
444620,1. Notre Dame 2 2. Vanderbilt 1 3. Duke 1 4. B...,UCLA,0.502261,222456.084009
444621,"[size=""+2""] [/size] * SAT Verbal: 730 * SAT ...",UNC,0.501669,222458.873785
