In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from nltk.stem import WordNetLemmatizer

In [2]:
import matplotlib.pyplot as plt

In [3]:
val_data = pd.read_csv('val.csv')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
train_data.head()

Unnamed: 0,Phrase,Sentiment
0,"I mean, who needs sleep when you can have the ...",-100
1,"I'm not sure what's more disturbing, the fact ...",-100
2,a neon green jumpsuit with plaid pants. I mean...,-100
3,"It's just 'good luck, shut up",-100
4,"clearly, i'm a functioning adul",1


In [5]:
test_data.head()

Unnamed: 0,Phrase,PhraseID
0,"I woke up, got out of bed, and managed to put ...",0
1,The human nose can detect over 1 trillion diff...,1
2,"Every pixel tells a story, every brushstroke a...",2
3,"Like, I'm a busy person, okay? I have a life, ...",3
4,The smell of old books and coffee fills my lun...,4


# Pre-Processing

In [6]:
train_data['Phrase'] = train_data['Phrase'].str.lower()
train_data['Phrase'] = train_data['Phrase'].str.replace(r'[^\w\s]', '', regex=True)
train_data['Phrase'] = train_data['Phrase'].str.replace(r'http\S+', '', regex=True)
train_data = train_data.dropna()


lemmatizer = WordNetLemmatizer()


def lemmatize_phrase(phrase):
    if isinstance(phrase, str):
        return ' '.join([lemmatizer.lemmatize(word) for word in phrase.split()])
    return phrase 

train_data['Phrase'] = train_data['Phrase'].apply(lemmatize_phrase)


In [7]:
val_data['Phrase'] = val_data['Phrase'].str.lower()
val_data['Phrase'] = val_data['Phrase'].str.replace(r'[^\w\s]', '', regex=True)
val_data['Phrase'] = val_data['Phrase'].str.replace(r'http\S+', '', regex=True)
val_data = val_data.dropna()

lemmatizer = WordNetLemmatizer()


def lemmatize_phrase(phrase):
    if isinstance(phrase, str):  
        return ' '.join([lemmatizer.lemmatize(word) for word in phrase.split()])
    return phrase  

val_data['Phrase'] = val_data['Phrase'].apply(lemmatize_phrase)

In [8]:
test_data['Phrase'] = test_data['Phrase'].str.lower()
test_data['Phrase'] = test_data['Phrase'].str.replace(r'[^\w\s]', '', regex=True)
test_data['Phrase'] = test_data['Phrase'].str.replace(r'http\S+', '', regex=True)
test_data = test_data.dropna()

lemmatizer = WordNetLemmatizer()

def lemmatize_phrase(phrase):
    if isinstance(phrase, str):  
        return ' '.join([lemmatizer.lemmatize(word) for word in phrase.split()])
    return phrase  


test_data['Phrase'] = test_data['Phrase'].apply(lemmatize_phrase)

In [9]:
train_data.head()

Unnamed: 0,Phrase,Sentiment
0,i mean who need sleep when you can have the sa...,-100
1,im not sure whats more disturbing the fact tha...,-100
2,a neon green jumpsuit with plaid pant i mean w...,-100
3,it just good luck shut up,-100
4,clearly im a functioning adul,1


# Augmentation using Logistic Regression

In [10]:
unlbl_data = train_data[train_data['Sentiment'] == -100] # seperating lablled and unlablled data
lbl_data = train_data[train_data['Sentiment'] != -100]

In [11]:
X = lbl_data['Phrase']
y = lbl_data['Sentiment']  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) 

# Pipeline that combines TF-IDF and Logistic Regression 
model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=200, solver='saga'))

model.fit(X_train, y_train)


In [12]:
y_pred = model.predict(X_val)

In [13]:
from sklearn.metrics import f1_score, accuracy_score

f1 = f1_score(y_val, y_pred, average='weighted')  
# Calculate Accuracy Score
accuracy = accuracy_score(y_val, y_pred) # testing the acccuracy within the lablled test set

# Print the scores
print(f'F1 Score: {f1:.4f}')
print(f'Accuracy Score: {accuracy:.4f}') 

F1 Score: 0.8308
Accuracy Score: 0.8307


In [14]:
model.fit(X, y) # without train-test split

In [15]:
y_pred = model.predict(X_val)

In [20]:
f1 = f1_score(y_val, y_pred, average='weighted')  

accuracy = accuracy_score(y_val, y_pred)


print(f'F1 Score: {f1:.4f}')
print(f'Accuracy Score: {accuracy:.4f}')

F1 Score: 0.8949
Accuracy Score: 0.8950


In [17]:
prediction = model.predict(unlbl_data['Phrase'])

In [18]:
unlbl_data['Sentiment'] = prediction # augmentation

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlbl_data['Sentiment'] = prediction # augmentation


In [21]:
unlbl_data.head()

Unnamed: 0,Phrase,Sentiment
0,i mean who need sleep when you can have the sa...,3
1,im not sure whats more disturbing the fact tha...,0
2,a neon green jumpsuit with plaid pant i mean w...,3
3,it just good luck shut up,3
5,yet in the memory we hold they remain a remind...,4


# Testing on validation with augmentation

In [22]:
combined_data = pd.concat([lbl_data, unlbl_data], ignore_index=False)

In [23]:
combined_data.head()

Unnamed: 0,Phrase,Sentiment
4,clearly im a functioning adul,1
6,and it not what we thought the truth is out,0
7,guess ill just wing it and hope for the best,3
9,it not like ive been eating pizza and ice crea...,3
15,keep being extra yall,1


In [24]:
len(combined_data)

59701

In [25]:
X = combined_data['Phrase']
y = combined_data['Sentiment']

In [26]:
model.fit(X, y)


In [27]:
X_val = val_data['Phrase']
y_val = val_data['Sentiment']

In [28]:
y_pred = model.predict(X_val)

In [61]:
f1 = f1_score(y_val, y_pred, average='weighted')  # Use 'weighted' for multi-class classification
accuracy = accuracy_score(y_val, y_pred)

print(f'F1 Score: {f1:.4f}')
print(f'Accuracy Score: {accuracy:.4f}')

F1 Score: 0.9121
Accuracy Score: 0.9122


# Prediction on validation without Augmentation

# Prediction on test data

In [64]:
for i in range(5):
    print(len(unlbl_data[unlbl_data['Sentiment'] == i]))

5236
6327
4582
5674
13129


In [65]:
for i in range(5):
    print(unlbl_data[unlbl_data['Sentiment'] == i].head())

                                               Phrase  Sentiment
1   im not sure whats more disturbing the fact tha...          0
17       i dont know if i can handle the truth anymor          0
25  the world most powerful family ha fallen the s...          0
45  the truth is finally out but the damage is alr...          0
63  world leader react with shock and condolegence...          0
                                               Phrase  Sentiment
18       lowkey just a college student trying to adul          1
30        anyone else feel like theyre just winging i          1
36  like my brain is onsleep and my body is like n...          1
44  im pretty sure my life is now a perfectly cura...          1
57  can we just have a conversation where im not t...          1
                                               Phrase  Sentiment
8   nicotine addiction affect million worldwide le...          2
11  it essential to break the stigma surrounding m...          2
22  let work together to 

In [62]:
train_val = pd.concat([combined_data, val_data], ignore_index=False)

In [63]:
model.fit(train_val['Phrase'], train_val['Sentiment'])

In [33]:
prediction = model.predict(test_data['Phrase'])

In [34]:
prediction

array([3, 2, 4, ..., 2, 3, 3])

In [35]:
pred = pd.DataFrame(prediction)

In [35]:
pred

Unnamed: 0,0
0,3
1,2
2,4
3,1
4,4
...,...
23246,2
23247,4
23248,2
23249,3


In [36]:
pred.value_counts()

2    4801
3    4771
4    4665
1    4552
0    4462
dtype: int64

In [37]:
test_data

Unnamed: 0,Phrase,PhraseID
0,i woke up got out of bed and managed to put on...,0
1,the human nose can detect over 1 trillion diff...,1
2,every pixel tell a story every brushstroke a m...,2
3,like im a busy person okay i have a life not a...,3
4,the smell of old book and coffee fill my lung ...,4
...,...,...
23252,learn about common allergen symptom and safety...,23252
23253,where will the horizon take me one day ill fin...,23253
23254,establish a bedtime routine create a sleepcond...,23254
23255,after careful consideration ive decided to wea...,23255


In [None]:
submission_df = pd.DataFrame({
    'PhraseID': test_data['PhraseID'],  # Assuming 'PhraseID' is the correct column name in test.csv
    'Sentiment': prediction
})

In [None]:
submission_df

In [None]:
sample = pd.read_csv('sample_submission.csv')

In [None]:
sample

In [None]:
test_data

In [None]:
test = pd.read_csv('test.csv')

In [None]:
test

In [None]:

submitted_ids = set(submission_df['PhraseID'])


original_ids = set(test['PhraseID'])


missing_ids = original_ids - submitted_ids

print("Missing PhraseIDs:", missing_ids)

In [None]:
missing_rows = test[test['PhraseID'].isin(missing_ids)]

missing_rows['Sentiment'] = 0  # or

In [None]:
submission_df = pd.concat([submission_df, missing_rows], ignore_index=True)

In [None]:

submission_df = submission_df.sort_values(by='PhraseID').reset_index(drop=True)

In [None]:
submission_df

In [None]:
submission_df[submission_df['PhraseID'] == 124]

In [None]:
submission_df = submission_df.drop(columns=['Phrase'])

In [None]:
submission_df

In [None]:
sample

In [None]:
submission_df.to_csv('submissions2.csv', index=False)