0-> False

1-> True

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
import pickle


In [None]:
# Downloading NLTK stopwords and tokenizer data, and creating the stemmer.
# nltk.word_tokenize needs the Punkt models: older NLTK releases read them
# from 'punkt', while newer releases (3.9 and later) read 'punkt_tab'.
# Requesting both keeps the notebook working on either version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Single stemmer instance shared by every call to preprocess_text.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the training CSV into a DataFrame.
DATA_PATH = '/content/train.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preprocessing steps
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading the corpus list is comparatively expensive, and a list makes
        # every membership test linear; cache it as a set instead.
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: convert to ``str`` and lowercase, delete every character
    that is neither a word character nor whitespace, tokenize with
    ``nltk.word_tokenize``, drop English stop words, stem the remaining
    tokens with the module-level ``stemmer``, and join them with single
    spaces.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()`` first, so a missing value such as NaN enters the
            pipeline as the text ``'nan'``.

    Returns:
        The cleaned text as a single string (empty if no tokens remain).
    """
    # Lowercase the text
    text = str(text).lower()

    # Strip punctuation/symbols; letters, digits, underscores and whitespace
    # are kept because they match \w or \s.
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize, remove stop words and stem. The stop-word set is fetched once
    # per call rather than once per token.
    stop_words = _english_stopwords()
    words = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]

    # Join the words back into a string
    return ' '.join(words)

In [None]:
# Clean the author and title columns in place, then build the combined
# "<author> <title>" text column used as the model input.
for column in ('author', 'title'):
    df[column] = df[column].apply(preprocess_text)
df['authorAndTitle'] = df['author'] + ' ' + df['title']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['authorAndTitle'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build TF-IDF features. The vectorizer is fitted on the training text only
# and then reused, unchanged, to transform the held-out text.
vectorizer = TfidfVectorizer()
X_train_vectors, X_test_vectors = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    vectorizer.transform(X_test),
)

In [None]:
# Training the Passive Aggressive Classifier
# The classifier shuffles the training data between epochs by default, so an
# unseeded model differs from run to run. Seeding it with the same value used
# for the train/test split makes the fitted model and its accuracy repeatable.
clf = PassiveAggressiveClassifier(max_iter=50, random_state=42)
clf.fit(X_train_vectors, y_train)

In [None]:
# Score the fitted classifier on the held-out vectors and report the
# fraction of labels it predicted correctly.
y_pred = clf.predict(X_test_vectors)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9925480769230769


In [None]:
# Saving the model using pickle
# NOTE: only the classifier is written to disk. The fitted `vectorizer` is
# also needed to turn text into features, so it has to be persisted too
# before the model can be used from a different session.
with open('PassiveAggressiveSavedModelTitleAndAuthor.pkl', 'wb') as file:
    pickle.dump(clf, file)

# Example of how to use the saved model on new data
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open('PassiveAggressiveSavedModelTitleAndAuthor.pkl', 'rb') as file:
    clf = pickle.load(file)

new_data = ['Breaking news: aliens spotted in New York City!',
            'Study shows that coffee can reduce the risk of heart disease']

# The vectorizer's vocabulary was learned from text cleaned by
# preprocess_text (stop words removed, tokens stemmed). New text must go
# through the same cleaning, otherwise its raw words will not match the
# learned vocabulary.
new_data_clean = [preprocess_text(text) for text in new_data]
new_data_vectors = vectorizer.transform(new_data_clean)
predictions = clf.predict(new_data_vectors)
print(predictions)

[0 1]
