# Q1 

In [None]:
from  nltk.stem  import  WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import pandas as pd
import nltk
import re, string
stop_words = set(stopwords.words('english'))

In [5]:
data = pd.read_csv('corona_fake.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159 entries, 0 to 1158
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   1077 non-null   object
 1   text    1151 non-null   object
 2   source  1142 non-null   object
 3   label   1159 non-null   object
dtypes: object(4)
memory usage: 36.3+ KB


In [56]:
#wordnetlemmatizer uses their own pos tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN 

def strip_emoji(text):
    RE_EMOJI = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    return RE_EMOJI.sub(r'', text)

def preprocess(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    tagged = nltk.pos_tag(tokens)
    lemmatized_tokens = []
    lemmmatizer = WordNetLemmatizer()
    for token, tag in tagged:
        if tag:
            lemmatized_tokens.append(lemmmatizer.lemmatize(word=token, pos=get_wordnet_pos(tag)))
        else:
            lemmatized_tokens.append(lemmmatizer.lemmatize(token))
    
    tokens = [word for word in lemmatized_tokens if word not in stop_words]

    #remove  numbers,  words  that  are  shorter  than  2 characters, punctuation, links and emojis.
    tokens = [re.sub(r"\b[0-9]+\b\s*", '', text) for text in tokens]
    translator = str.maketrans("","", string.punctuation)
    tokens = [text.translate(translator) for text in tokens]
    tokens = [re.sub(r"https?://\S+", "", text) for text in tokens]
    tokens = [re.sub(r"[^A-Za-z0-9\s]+", "", text) for text in tokens] #also remove special chars if any
    tokens = [strip_emoji(text) for text in tokens]
    tokens = [text for text in tokens if len(text)>2]

    return " ".join(tokens)



In [58]:
data['text_clean'] = data['text'].astype(str).apply(preprocess)

# Q2

## a:
An N Gram is a sequence of N adjacent symbols, with a specified order.

In the context of NLP, N-gram captures multi word semantics and word order which is important for models like Bag of Words to represent the corpus correctly.

## b:

In [59]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

## c:

In [None]:
cvectorizer1 = CountVectorizer(ngram_range=(1,1), lowercase=True)
cvectorizer2 = CountVectorizer(ngram_range=(1,2), lowercase=True)
cvectorizer3 = CountVectorizer(ngram_range=(1,3), lowercase=True)

vector1 = cvectorizer1.fit_transform(data['text_clean'])
vector2 = cvectorizer2.fit_transform(data['text_clean'])
vector3 = cvectorizer3.fit_transform(data['text_clean'])


## d:

In [62]:
tfid1 = TfidfVectorizer(ngram_range=(1,1), lowercase=True)
tfid2 = TfidfVectorizer(ngram_range=(1,2), lowercase=True)
tfid3 = TfidfVectorizer(ngram_range=(1,3), lowercase=True)

tfvector1 = tfid1.fit_transform(data['text_clean'])
tfvector2 = tfid2.fit_transform(data['text_clean'])
tfvector3 = tfid3.fit_transform(data['text_clean'])

# Q3

In [78]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

target = data['label']
vectors = [vector1, vector2, vector3, tfvector1, tfvector2, tfvector3]
methods = [f'Count Vectorizer with N-Gram (1, {i})' for i in range(1,4)]+[f'TfIDf Vectorizer with N-Gram (1, {i})' for i in range(1,4)]
accuracies = []
for vec, method in zip(vectors, methods):
    X_train, X_test, y_train, y_test = train_test_split(vec, target, train_size=0.7, random_state=276)
    reg = LogisticRegressionCV(cv = 5, random_state = 265,  max_iter = 1000,  n_jobs = -1)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    acc = accuracy_score(y_test,y_pred)
    print(f"Accuracy of {method}:{acc}")
    accuracies.append(acc)



Accuracy of Count Vectorizer with N-Gram (1, 1):0.882183908045977
Accuracy of Count Vectorizer with N-Gram (1, 2):0.8936781609195402
Accuracy of Count Vectorizer with N-Gram (1, 3):0.8879310344827587
Accuracy of TfIDf Vectorizer with N-Gram (1, 1):0.9224137931034483
Accuracy of TfIDf Vectorizer with N-Gram (1, 2):0.9137931034482759
Accuracy of TfIDf Vectorizer with N-Gram (1, 3):0.9080459770114943


In [79]:
acc_df = pd.DataFrame({'Method': methods, 'Accuracy Score': accuracies})
acc_df


Unnamed: 0,Method,Accuracy Score
0,"Count Vectorizer with N-Gram (1, 1)",0.882184
1,"Count Vectorizer with N-Gram (1, 2)",0.893678
2,"Count Vectorizer with N-Gram (1, 3)",0.887931
3,"TfIDf Vectorizer with N-Gram (1, 1)",0.922414
4,"TfIDf Vectorizer with N-Gram (1, 2)",0.913793
5,"TfIDf Vectorizer with N-Gram (1, 3)",0.908046


# Q4

## a :  Newton-CG (Newton’s Conjugate Gradient Method)
**Newton’s method** approximates roots of  𝑓(𝑥)=0  by starting with an initial approximation  𝑥0 , then uses tangent lines to the graph of  𝑓  to update its approximations, powered by the first and second derivative of the function. This is then used to try to minimize the log likelihood which helps estimate the distribution parameters through MLE.

$$
\beta^{(t+1)} = \beta^{(t)} - H^{-1} \nabla L(\beta)
$$

Where:
- \( \beta \) represents the parameter vector.
- \( H \) is the Hessian matrix (second-order partial derivatives of the loss function).
- $\nabla L(\beta)$ is the gradient of the loss function.

Specifially, in logistic regression, the `newton-cg` solver is used, which is based on the Newton’s method combined with conjugate gradient descent. Conjugate Gradient inverts the Hessian matrix iteratively, which is faster and more memory-efficient than a direct inversion. It works especially well when the dataset is large, and computing the full Hessian explicitly is expensive.

_____



## b. L-BFGS (Limited-memory Broyden-Fletcher-Goldfarb-Shanno)
`lbfgs` is a quasi-Newton optimization algorithm that approximates the second-order derivatives without calculating the full Hessian matrix. Like other optimization algorithms, it calculates the gradient of the log-likelihood function but instead of calculating the exact second derivatives, it uses the last m gradient updates to estimate it.

L-BFGS approximates the inverse Hessian using past gradients and updates the parameters:

$$
\beta^{(t+1)} = \beta^{(t)} - \alpha H^{-1} \nabla L(\beta)
$$

Where:
- $\alpha$ is the learning rate.
- $H^{-1}$ is the approximated inverse Hessian matrix.
- $\nabla L(\beta)$ is the gradient.

---

## c. Liblinear (Coordinate Descent for L1 and L2 Penalties)

It’s based on the LIBLINEAR library—specifically designed for large-scale linear classification. 

For L1 Regularization: It uses a coordinate descent method, which updates one parameter at a time while keeping others fixed. This is efficient for enforcing sparsity (many parameters being exactly zero).

For L2 Regularization: It uses a trust region Newton method—a variation of Newton’s method that restricts updates to a safe region to avoid overshooting.

One-vs-Rest (OvR) for Multi-Class: Since it cannot handle multinomial loss directly, it fits one binary model per class and combines them.
Liblinear solves the optimization problem using coordinate-wise updates:

$$
\min_{\beta} \left( L(\beta) + \lambda \|\beta\|_1 + \frac{1}{2} \lambda_2 \|\beta\|_2^2 \right)
$$

Where:
- $L(\beta)$ is the loss function (log-loss for logistic regression).
- $\lambda \|\beta\|_1$ is the L1 regularization term (for sparsity).
- $\lambda_2 \|\beta\|_2^2$ is the L2 regularization term (for weight shrinkage).

---

## d. SAG (Stochastic Average Gradient)
SAG is a variance-reduced version of Stochastic Gradient Descent (SGD). Instead of updating parameters based on a single random sample, it keeps track of and averages all previous gradients, improving convergence speed and stability. Due to averaging, there is fast convergence on large and high-dimensional data. It only works with L2 regularization. In SAG, each update relies on the gradient of a single randomly chosen sample, combined with the average of previous gradients. This leads to biased gradient estimates.

$$
\beta^{(t+1)} = \beta^{(t)} - \eta \left( \bar{g} \right)
$$

Where:

- $\eta$ is the learning rate.
- $g_i$ is the gradient of the \( i \)-th sample.
- $\bar{g} = \frac{1}{n} \sum_{i=1}^{n} g_i $ is the average of all stored gradients.
- $n$ is the number of data points.

---

## e. SAGA (SAG with Variance Reduction)

SAGA is an improved version of SAG that adds support for non-smooth penalties (like L1). It corrects bias of SAG to keep estiimators unbiased. It uses updates to manage L1 regularization, which can force some coefficients to zero to keep data sparsity.

$$
\beta^{(t+1)} = \beta^{(t)} - \eta \left( \nabla L_i(\beta) - g_i + \bar{g} \right)
$$

Where:
- $g_i$ is the stored gradient for the $i$-th sample.
- $\bar{g}$ is the average gradient across all samples.
- $\eta$ is the learning rate.

---


