In [19]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline

In [20]:
# Read the three dataset splits from the working directory
# (validation, training and test, in that order).
val_data, train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('val.csv', 'train.csv', 'test.csv')
)

# Preprocessing

In [21]:
train_data['Phrase'] = train_data['Phrase'].str.lower()
train_data['Phrase'] = train_data['Phrase'].str.replace(r'[^\w\s]', '', regex=True)
train_data['Phrase'] = train_data['Phrase'].str.replace(r'http\S+', '', regex=True)
train_data = train_data.dropna()


lemmatizer = WordNetLemmatizer()


def lemmatize_phrase(phrase):
    if isinstance(phrase, str):
        return ' '.join([lemmatizer.lemmatize(word) for word in phrase.split()])
    return phrase 

train_data['Phrase'] = train_data['Phrase'].apply(lemmatize_phrase)

In [22]:
val_data['Phrase'] = val_data['Phrase'].str.lower()
val_data['Phrase'] = val_data['Phrase'].str.replace(r'[^\w\s]', '', regex=True)
val_data['Phrase'] = val_data['Phrase'].str.replace(r'http\S+', '', regex=True)
val_data = val_data.dropna()

lemmatizer = WordNetLemmatizer()


def lemmatize_phrase(phrase):
    if isinstance(phrase, str):  
        return ' '.join([lemmatizer.lemmatize(word) for word in phrase.split()])
    return phrase  

val_data['Phrase'] = val_data['Phrase'].apply(lemmatize_phrase)

In [23]:
test_data['Phrase'] = test_data['Phrase'].str.lower()
test_data['Phrase'] = test_data['Phrase'].str.replace(r'[^\w\s]', '', regex=True)
test_data['Phrase'] = test_data['Phrase'].str.replace(r'http\S+', '', regex=True)
test_data = test_data.dropna()

lemmatizer = WordNetLemmatizer()

def lemmatize_phrase(phrase):
    if isinstance(phrase, str):  
        return ' '.join([lemmatizer.lemmatize(word) for word in phrase.split()])
    return phrase  


test_data['Phrase'] = test_data['Phrase'].apply(lemmatize_phrase)

# Augmentation using K-means Clustering

In [24]:
# Rows whose 'Sentiment' equals this placeholder are treated as unlabelled.
UNLABELLED_SENTIMENT = -100

# Separate unlabelled and labelled training rows. `.copy()` makes each part
# an independent frame, so adding columns to it later neither raises
# SettingWithCopyWarning nor depends on how pandas shares data with
# `train_data`.
is_unlabelled = train_data['Sentiment'] == UNLABELLED_SENTIMENT
unlbl_data = train_data[is_unlabelled].copy()
lbl_data = train_data[~is_unlabelled].copy()

In [10]:
!pip install --upgrade threadpoolctl scikit-learn

!pip uninstall openblas
!pip install openblas

[0m[31mERROR: Could not find a version that satisfies the requirement openblas (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for openblas[0m[31m
[0m

In [25]:
# Cluster the unlabelled phrases on their TF-IDF representation.

# Text to vectorise: the cleaned 'Phrase' column of the unlabelled rows.
text_data = unlbl_data['Phrase']

# TF-IDF features, capped at the 1000 highest-ranked terms to keep the
# matrix small.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Number of clusters; later cells map clusters one-to-one onto the five
# sentiment labels.
num_clusters = 5

# `n_init` is set explicitly because its default differs between
# scikit-learn releases; together with `random_state` this keeps the
# clustering reproducible regardless of the installed version.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# `assign` returns a new frame, so the cluster ids are never written into a
# slice of `train_data` (no SettingWithCopyWarning) and re-running the cell
# is safe.
unlbl_data = unlbl_data.assign(Cluster=kmeans.fit_predict(tfidf_matrix))

# Preview the cluster assignment.
unlbl_data[['Phrase', 'Cluster']].head()




AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
# Project the TF-IDF features onto two principal components so the clusters
# can be inspected visually. The sparse matrix is densified with `toarray()`
# before being passed to PCA.
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# Attach the 2-D coordinates; `assign` returns a new frame instead of writing
# into a possible slice of `train_data`.
unlbl_data = unlbl_data.assign(PCA1=reduced_data[:, 0], PCA2=reduced_data[:, 1])

# Scatter plot coloured by cluster id, drawn on an explicit Axes so the
# figure does not depend on pyplot's implicit "current axes" state.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x='PCA1', y='PCA2', hue='Cluster',
    data=unlbl_data, palette='viridis', s=60, ax=ax,
)
ax.set_title('K-means Clusters of Phrases (2D PCA)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
plt.show()

In [None]:
# Number of phrases per cluster, indexed by cluster id. The range comes from
# `num_clusters` rather than a literal, and clusters that received no phrase
# are reported as 0.
unlbl_data['Cluster'].value_counts().reindex(range(num_clusters), fill_value=0)

In [None]:
# Print the first rows of every cluster for manual inspection. The range
# follows `num_clusters` so it stays in sync with the clustering cell.
for cluster_id in range(num_clusters):
    print(unlbl_data[unlbl_data['Cluster'] == cluster_id].head())
    
 

In [None]:
# One-hot sentiment vector per cluster: cluster k carries a 1 at position k
# and 0 elsewhere. The names this notebook attaches to the clusters are
#   0 Positive, 1 Negative, 2 Neutral, 3 Mixed, 4 Ambiguous.
cluster_sentiment_mapping = {
    cluster: [int(slot == cluster) for slot in range(5)]
    for cluster in range(5)
}

In [None]:
# Sentiment class ids, also used as the names of the indicator columns.
sentiment_labels = list(range(5))

In [None]:
# Own the frame before adding columns, so the writes below never target a
# slice of `train_data` (avoids SettingWithCopyWarning).
unlbl_data = unlbl_data.copy()

# Add one indicator column per sentiment label. For each label, look up the
# flag every cluster carries at that position in `cluster_sentiment_mapping`
# and map it onto the rows in one vectorised step; rows whose cluster is
# missing from the mapping get 0.
for position, label in enumerate(sentiment_labels):
    flag_by_cluster = {
        cluster: flags[position]
        for cluster, flags in cluster_sentiment_mapping.items()
    }
    unlbl_data[label] = (
        unlbl_data['Cluster'].map(flag_by_cluster).fillna(0).astype(int)
    )

# Show a preview instead of rendering the whole frame.
unlbl_data.head()

In [None]:
# Text classifier: TF-IDF features followed by logistic regression.
# The 'saga' solver shuffles the data while optimising, so `random_state` is
# fixed to make repeated runs produce the same model.
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=200, solver='saga', random_state=42),
)

In [None]:
# Print the first rows of every cluster again; at this point the frame also
# carries the per-sentiment indicator columns. The range follows
# `num_clusters` so it stays in sync with the clustering cell.
for cluster_id in range(num_clusters):
    print(unlbl_data[unlbl_data['Cluster'] == cluster_id].head())

In [None]:
# For every sentiment label, fit the pipeline on the cluster-derived 0/1
# indicator column and score it on the validation split.
#
# The model predicts 0/1, so the validation target is binarised the same way
# (1 where the true sentiment equals `label`). Scoring 0/1 predictions
# directly against the raw 'Sentiment' column compares two different label
# spaces and yields meaningless numbers.
# NOTE(review): assumes val_data['Sentiment'] uses the same label values as
# `sentiment_labels` - confirm against the data.
for label in sentiment_labels:
    model.fit(unlbl_data['Phrase'], unlbl_data[label])
    y_pred = model.predict(val_data['Phrase'])
    y_true = (val_data['Sentiment'] == label).astype(int)

    f1 = f1_score(y_true, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)

    # Identify which indicator the following two scores belong to.
    print(f'--- Sentiment {label} ---')
    print(f'F1 Score: {f1:.4f}')
    print(f'Accuracy Score: {accuracy:.4f}')

In [None]:
# Pseudo-label each unlabelled phrase with the sentiment whose indicator
# column is set for its cluster (the column holding the row's maximum, i.e.
# the 1 of the one-hot encoding). Copying indicator column 1 alone would
# label every phrase outside cluster 1 as sentiment 0 and discard the rest
# of `cluster_sentiment_mapping`.
# NOTE(review): confirm that a full multi-class pseudo-label is what the
# augmentation is meant to produce.
unlbl_data['Sentiment'] = unlbl_data[sentiment_labels].idxmax(axis=1).astype(int)

In [None]:
# Display the unlabelled frame as the cell output, for inspection after its
# 'Sentiment' column has been overwritten.
unlbl_data

In [None]:
# Keep only the text and its sentiment value; the cluster id, PCA coordinates
# and indicator columns are left behind.
kept_columns = ['Phrase', 'Sentiment']
unlbl_data_filtered = unlbl_data.loc[:, kept_columns]

In [None]:
# Display the two-column ('Phrase', 'Sentiment') frame as the cell output.
unlbl_data_filtered

In [None]:
# Stack the labelled rows on top of the filtered unlabelled rows, keeping
# every row's existing index label.
frames_to_stack = [lbl_data, unlbl_data_filtered]
combined_data = pd.concat(frames_to_stack, axis=0, ignore_index=False)

In [None]:
# Preview the first rows of the combined frame.
combined_data.head()