In [None]:
#task1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Train a random forest that predicts the 'Sentiment' label from the TF-IDF
# representation of 'Tweet text', then report hold-out classification metrics.
DATA_PATH = 'dataset.csv'
TEXT_COLUMN = 'Tweet text'
LABEL_COLUMN = 'Sentiment'
TEST_FRACTION = 0.2
SEED = 42  # shared by the split and the forest so a re-run reproduces the metrics

# TfidfVectorizer rejects NaN documents, so rows missing the text or the label
# are dropped up front instead of failing later inside fit_transform.
df = pd.read_csv(DATA_PATH).dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

X = df[TEXT_COLUMN].astype(str)
y = df[LABEL_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed so nothing leaks from it into training.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Without random_state the forest is re-randomised on every run and the
# reported scores drift between executions.
rf_model = RandomForestClassifier(random_state=SEED)
rf_model.fit(X_train_vectorized, y_train)

predictions = rf_model.predict(X_test_vectorized)

# 'weighted' averages the per-class scores by class support.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
#Task2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Train a random forest that predicts the 'User ID' of a tweet's author from
# the TF-IDF representation of 'Tweet text', then report hold-out metrics.
DATA_PATH = 'dataset.csv'
TEXT_COLUMN = 'Tweet text'
LABEL_COLUMN = 'User ID'
TEST_FRACTION = 0.2
SEED = 42  # shared by the split and the forest so a re-run reproduces the metrics

# TfidfVectorizer rejects NaN documents, so rows missing the text or the label
# are dropped up front instead of failing later inside fit_transform.
df = pd.read_csv(DATA_PATH).dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

X = df[TEXT_COLUMN].astype(str)
y = df[LABEL_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed so nothing leaks from it into training.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Without random_state the forest is re-randomised on every run and the
# reported scores drift between executions.
rf_model = RandomForestClassifier(random_state=SEED)
rf_model.fit(X_train_vectorized, y_train)

predictions = rf_model.predict(X_test_vectorized)

# 'weighted' averages the per-class scores by class support.
# NOTE(review): if many user IDs occur only once, some test labels will never
# be predicted and scikit-learn will warn about undefined precision -- confirm
# that every user has enough tweets for these scores to be meaningful.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
#task3
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Cluster tweets with k-means on their TF-IDF vectors, report the silhouette
# score, and print a preview of the tweets assigned to each cluster.
DATA_PATH = 'dataset.csv'
TEXT_COLUMN = 'Tweet text'
SEED = 42
MAX_TWEETS_SHOWN = 10  # per cluster; set to None to print every tweet

# TfidfVectorizer rejects NaN documents, so rows without text are dropped.
# .copy() makes the filtered frame independent so the label column can be
# added below without a SettingWithCopyWarning.
df = pd.read_csv(DATA_PATH).dropna(subset=[TEXT_COLUMN]).copy()

X = df[TEXT_COLUMN].astype(str)

vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

num_clusters = 5
# random_state fixes the centroid initialisation; n_init is pinned explicitly
# because its default differs between scikit-learn releases. Together they make
# the cluster assignments reproducible across runs and versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=SEED)
kmeans.fit(X_vectorized)

silhouette_avg = silhouette_score(X_vectorized, kmeans.labels_)
print("Silhouette Score:", silhouette_avg)

cluster_labels = kmeans.labels_
df['Cluster Label'] = cluster_labels

for cluster_label in range(num_clusters):
    cluster_tweets = df.loc[df['Cluster Label'] == cluster_label, TEXT_COLUMN]
    print(f"Cluster {cluster_label}:")
    # Printing the whole dataset floods the notebook output, so only a
    # preview is shown unless MAX_TWEETS_SHOWN is None.
    if MAX_TWEETS_SHOWN is None:
        shown_tweets = cluster_tweets
    else:
        shown_tweets = cluster_tweets.head(MAX_TWEETS_SHOWN)
    for tweet in shown_tweets:
        print(tweet)
    hidden_count = len(cluster_tweets) - len(shown_tweets)
    if hidden_count:
        print(f"... ({hidden_count} more tweets not shown)")
    print()

In [None]:
#task4
import pandas as pd
from scipy import sparse
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Fit a multi-output random forest that predicts the retweet, favorite and
# reply counts of a tweet from its text plus two extra feature columns, then
# report hold-out regression metrics.
DATA_PATH = 'dataset.csv'
TEXT_COLUMN = 'Tweet text'
EXTRA_COLUMNS = ['User information', 'Hashtags used']
TARGET_COLUMNS = ['Retweet count', 'Favorite count', 'Reply count']
TEST_FRACTION = 0.2
SEED = 42  # shared by the split and the forest so a re-run reproduces the metrics

# Rows without tweet text or without any of the targets cannot be used for
# training or scoring, so they are dropped before the split.
df = pd.read_csv(DATA_PATH).dropna(subset=[TEXT_COLUMN] + TARGET_COLUMNS)

X = df[[TEXT_COLUMN] + EXTRA_COLUMNS].copy()
X[TEXT_COLUMN] = X[TEXT_COLUMN].astype(str)
y = df[TARGET_COLUMNS]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)

# The vocabulary and IDF weights are learned from the training split only.
vectorizer = TfidfVectorizer()
X_train_text = vectorizer.fit_transform(X_train[TEXT_COLUMN])
X_test_text = vectorizer.transform(X_test[TEXT_COLUMN])


def _encode_extra_column(train_values, test_values):
    """Encode one extra feature column as a pair of sparse matrices.

    Numeric columns are passed through as a single float column. Any other
    dtype is treated as free text and TF-IDF encoded with a vectorizer fitted
    on the training values only; missing entries become empty documents.

    Parameters:
        train_values: pandas Series holding the column's training rows.
        test_values: pandas Series holding the column's test rows.

    Returns:
        (train_matrix, test_matrix) with one row per input row.
    """
    if pd.api.types.is_numeric_dtype(train_values):
        # NOTE(review): NaN values are passed through untouched -- confirm
        # whether numeric extra columns need imputation before fitting.
        train_matrix = sparse.csr_matrix(train_values.to_numpy(dtype=float).reshape(-1, 1))
        test_matrix = sparse.csr_matrix(test_values.to_numpy(dtype=float).reshape(-1, 1))
        return train_matrix, test_matrix

    column_vectorizer = TfidfVectorizer()
    train_matrix = column_vectorizer.fit_transform(train_values.fillna('').astype(str))
    test_matrix = column_vectorizer.transform(test_values.fillna('').astype(str))
    return train_matrix, test_matrix


# Feature blocks are stacked positionally as sparse matrices. The previous
# pd.concat(axis=1) aligned a RangeIndex TF-IDF frame against the shuffled
# index of the split, which misaligned rows and padded the result with NaN,
# and it also handed raw non-numeric columns to the regressor.
train_blocks = [X_train_text]
test_blocks = [X_test_text]
for column in EXTRA_COLUMNS:
    train_block, test_block = _encode_extra_column(X_train[column], X_test[column])
    train_blocks.append(train_block)
    test_blocks.append(test_block)

X_train_features = sparse.hstack(train_blocks, format='csr')
X_test_features = sparse.hstack(test_blocks, format='csr')

rf = RandomForestRegressor(random_state=SEED)
rf.fit(X_train_features, y_train)

y_pred = rf.predict(X_test_features)

# With several target columns both metrics are averaged uniformly over the
# targets (scikit-learn's default `multioutput` behaviour).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)