In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Helper Function for Text Cleaning:

Implement a helper function as described in the Text Preprocessing notebook, then complete the following pipeline.

In [1]:
# Install required libraries if not already installed
!pip install nltk
!pip install emoji

# Import necessary libraries
import pandas as pd
import re
import string
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Download the NLTK resources used later in this notebook:
# 'stopwords' is read by stopwords.words('english') and 'wordnet' is the
# corpus behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m590.6/590.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Build a Text Cleaning Pipeline

In [3]:
# Patterns are compiled once here instead of being looked up on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_NOISE_PATTERN = re.compile(r'@\w+|#\w+|[^A-Za-z\s]')
_VALID_RULES = ("lemmatize", "stem")

# Filled lazily by _get_cleaning_resources() so the NLTK data is only touched
# when the pipeline is actually used.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the shared stop-word set, lemmatizer and stemmer, building them on first use."""
    if not _CLEANING_RESOURCES:
        # Build everything first so a failure cannot leave a half-filled cache.
        resources = {
            "stop_words": frozenset(stopwords.words('english')),
            "lemmatizer": WordNetLemmatizer(),
            "stemmer": PorterStemmer(),
        }
        _CLEANING_RESOURCES.update(resources)
    return _CLEANING_RESOURCES


def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Clean and normalise a single piece of text.

    Steps, in order: lowercase; remove URLs; remove emoji; remove @mentions
    and #hashtags (including the word attached to them) and every character
    that is not an ASCII letter or whitespace; split on whitespace; drop
    English stop words; lemmatize or stem each remaining token.

    Parameters
    ----------
    dataset : str
        The text to clean. ``None`` and float NaN (what pandas yields for
        missing cells) are treated as empty text; other non-string values are
        converted with ``str()``.
    rule : {"lemmatize", "stem"}, default "lemmatize"
        Which token normalisation to apply.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``""`` if nothing is left).

    Raises
    ------
    ValueError
        If ``rule`` is neither "lemmatize" nor "stem".
    """
    # Fail fast on a bad rule instead of printing a message for every row and
    # silently returning un-normalised text.
    if rule not in _VALID_RULES:
        raise ValueError(f"Pick between lemmatize or stem (got {rule!r})")

    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if dataset is None or (isinstance(dataset, float) and dataset != dataset):
        return ""

    data = str(dataset).lower()
    data = _URL_PATTERN.sub('', data)
    data = emoji.replace_emoji(data, replace='')
    data = _NOISE_PATTERN.sub('', data)

    resources = _get_cleaning_resources()
    stop_words = resources["stop_words"]
    tokens = [word for word in data.split() if word not in stop_words]

    if rule == "lemmatize":
        normalize = resources["lemmatizer"].lemmatize
    else:
        normalize = resources["stemmer"].stem
    return " ".join(normalize(word) for word in tokens)


# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` and `"label"`.

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - "Complete the above function"

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


In [4]:
# Location of the tweet sentiment CSV on the mounted Google Drive.
DATASET_PATH = '/content/drive/MyDrive/ARTIFICIAL INTELLIGENCE AND MACHINE LEARNING/Copy of trum_tweet_sentiment_analysis.csv'

df = pd.read_csv(DATASET_PATH)
df.head()


Unnamed: 0,text,Sentiment
0,RT @JohnLeguizamo: #trump not draining swamp b...,0
1,ICYMI: Hackers Rig FM Radio Stations To Play A...,0
2,Trump protests: LGBTQ rally in New York https:...,1
3,"""Hi I'm Piers Morgan. David Beckham is awful b...",0
4,RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...,0


In [5]:
# Run every tweet through the lemmatizing variant of the cleaning pipeline and
# store the result in a new column next to the raw text.
df['cleaned_text'] = df['text'].apply(text_cleaning_pipeline, rule="lemmatize")
df[['text', 'cleaned_text']].head()


Unnamed: 0,text,cleaned_text
0,RT @JohnLeguizamo: #trump not draining swamp b...,rt draining swamp taxpayer dollar trip adverti...
1,ICYMI: Hackers Rig FM Radio Stations To Play A...,icymi hacker rig fm radio station play antitru...
2,Trump protests: LGBTQ rally in New York https:...,trump protest lgbtq rally new york via
3,"""Hi I'm Piers Morgan. David Beckham is awful b...",hi im pier morgan david beckham awful donald t...
4,RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...,rt tech firm suing buzzfeed publishing unverif...


In [8]:
# Show the DataFrame's column labels.
print(df.columns)


Index(['text', 'Sentiment', 'cleaned_text'], dtype='object')


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['Sentiment'],
    test_size=0.2,
    random_state=42,
)


In [10]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [11]:
# Train a logistic regression classifier on the TF-IDF training features.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict on the held-out features and print per-class precision/recall/F1.
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.95      0.97      0.96    248563
           1       0.93      0.90      0.91    121462

    accuracy                           0.94    370025
   macro avg       0.94      0.93      0.93    370025
weighted avg       0.94      0.94      0.94    370025

