In [None]:
# Imports are isolated in this cell: after a kernel restart, run it first, and then any other cell can be run on its own.

import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [None]:
#Splitting the data and putting it through the logistic regression

# Path to the source CSV that the loop below reads in chunks; only its
# 'content' and 'type' columns are used.
file_path = '995,000_rows.csv'

# The stop-word set and the stemmer do not change between calls, so they are
# built once on first use and then reused; this function is applied to every
# row, and reloading the stop-word list per row is wasted work.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stop_words, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_content(content):
    """Tokenize a text, drop English stop words, and stem what remains.

    Args:
        content: The raw text to clean (a string).

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    stop_words, stemmer = _get_preprocess_resources()
    tokens = nltk.word_tokenize(content)
    # Stop-word matching is case-insensitive; the surviving token is stemmed as-is.
    return ' '.join(
        stemmer.stem(word) for word in tokens if word.lower() not in stop_words
    )


# Stream the corpus in chunks, preprocess each chunk, split it 80/10/10 into
# train / validation / test, save the splits, then fit and score the classifier.
X_train, X_val, X_test, y_train, y_val, y_test = [], [], [], [], [], []
chunk_size = 10000
current_chunk = 0

# An 80/10/10 split cannot give every part a row with fewer rows than this.
# train_test_split raises on chunks that are too small, which would abort the
# whole run before anything is saved, so such chunks go entirely to training.
min_rows_to_split = 10

# Only the two columns used below are parsed, which keeps every chunk small.
for chunk in pd.read_csv(file_path, chunksize=chunk_size, usecols=['content', 'type']):
    current_chunk += 1
    print(f"Processing chunk {current_chunk}...")

    # Drop incomplete rows, then replace the raw text with its cleaned form.
    # `.assign` builds a new frame instead of writing into a derived one.
    chunk = (
        chunk[['content', 'type']]
        .dropna()
        .assign(content=lambda frame: frame['content'].apply(preprocess_content))
    )

    X_chunk = chunk['content']
    y_chunk = chunk['type']

    if len(chunk) < min_rows_to_split:
        X_train.extend(X_chunk)
        y_train.extend(y_chunk)
    else:
        # First cut: 80% train / 20% held out. Second cut: halve the held-out
        # part into validation and test.
        X_train_chunk, X_temp, y_train_chunk, y_temp = train_test_split(
            X_chunk, y_chunk, test_size=0.2, random_state=42
        )
        X_val_chunk, X_test_chunk, y_val_chunk, y_test_chunk = train_test_split(
            X_temp, y_temp, test_size=0.5, random_state=42
        )

        X_train.extend(X_train_chunk)
        y_train.extend(y_train_chunk)
        X_val.extend(X_val_chunk)
        y_val.extend(y_val_chunk)
        X_test.extend(X_test_chunk)
        y_test.extend(y_test_chunk)

    print(f"Processed chunk {current_chunk}.")

X_train, X_val, X_test = pd.Series(X_train), pd.Series(X_val), pd.Series(X_test)
y_train, y_val, y_test = pd.Series(y_train), pd.Series(y_val), pd.Series(y_test)


# Persist the preprocessed splits so they can be reloaded without redoing the
# tokenizing and stemming above.
train_data = pd.DataFrame({'content': X_train, 'type': y_train})
val_data = pd.DataFrame({'content': X_val, 'type': y_val})
test_data = pd.DataFrame({'content': X_test, 'type': y_test})

train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)


# TF-IDF features feeding a logistic-regression classifier.
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000, random_state=42)
)

model.fit(X_train, y_train)

# The validation split was built above but never scored; report it next to the
# test score so model choices do not have to be made on the test set.
val_accuracy = accuracy_score(y_val, model.predict(X_val))
test_accuracy = accuracy_score(y_test, model.predict(X_test))

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")


Processing chunk 1...
Processed chunk 1.
Processing chunk 2...
Processed chunk 2.
Processing chunk 3...
Processed chunk 3.
Processing chunk 4...
Processed chunk 4.
Processing chunk 5...
Processed chunk 5.
Processing chunk 6...
Processed chunk 6.
Processing chunk 7...
Processed chunk 7.
Processing chunk 8...
Processed chunk 8.
Processing chunk 9...
Processed chunk 9.
Processing chunk 10...
Processed chunk 10.
Processing chunk 11...
Processed chunk 11.
Processing chunk 12...
Processed chunk 12.
Processing chunk 13...
Processed chunk 13.
Processing chunk 14...
Processed chunk 14.
Processing chunk 15...
Processed chunk 15.
Processing chunk 16...
Processed chunk 16.
Processing chunk 17...
Processed chunk 17.
Processing chunk 18...
Processed chunk 18.
Processing chunk 19...
Processed chunk 19.
Processing chunk 20...
Processed chunk 20.
Processing chunk 21...
Processed chunk 21.
Processing chunk 22...
Processed chunk 22.
Processing chunk 23...
Processed chunk 23.
Processing chunk 24...
Proces

In [None]:
# Bundle each split into a pandas DataFrame and write it out as a CSV file.
train_data, val_data, test_data = (
    pd.DataFrame({'content': features, 'type': labels})
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

for split_frame, csv_name in (
    (train_data, 'train_data.csv'),
    (val_data, 'val_data.csv'),
    (test_data, 'test_data.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [1]:
# Technically part of Part 4: Evaluation, but included in this notebook because it reuses the same model pipeline (refit here on the LIAR data).


# The stop-word set and the stemmer do not change between calls, so they are
# built once on first use and then reused; this function is applied to every
# row, and reloading the stop-word list per row is wasted work.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stop_words, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_content(content):
    """Tokenize a text, drop English stop words, and stem what remains.

    Args:
        content: The raw text to clean (a string).

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    stop_words, stemmer = _get_preprocess_resources()
    tokens = nltk.word_tokenize(content)
    # Stop-word matching is case-insensitive; the surviving token is stemmed as-is.
    return ' '.join(
        stemmer.stem(word) for word in tokens if word.lower() not in stop_words
    )

# Fit and score the TF-IDF + logistic-regression pipeline on the LIAR splits.
#
# NOTE: the imports cell at the top of the notebook must have been run in this
# kernel first; otherwise this cell stops with
# "NameError: name 'pd' is not defined".
# NOTE(review): assumes each TSV has a header row exposing 'content' and 'type'
# columns - confirm against the files.
# NOTE(review): `model` is rebuilt and refit on the LIAR training split here,
# replacing any model fitted earlier in the kernel - confirm that this, rather
# than scoring the previously trained model on LIAR, is the intent.

def _load_liar_split(path):
    """Read one tab-separated split, drop incomplete rows, and clean its text."""
    # Rows missing either used column are dropped before preprocessing, as is
    # done for the chunked CSV corpus; the tokenizer cannot handle NaN.
    split = pd.read_csv(path, sep='\t').dropna(subset=['content', 'type'])
    return split.assign(content=split['content'].apply(preprocess_content))


train_data = _load_liar_split('liar_train.tsv')
val_data = _load_liar_split('liar_valid.tsv')
test_data = _load_liar_split('liar_test.tsv')

X_train, y_train = train_data['content'], train_data['type']
X_val, y_val = val_data['content'], val_data['type']
X_test, y_test = test_data['content'], test_data['type']

model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000, random_state=42)
)

model.fit(X_train, y_train)

# The validation split was loaded but never scored; report it next to the test
# score.
val_accuracy = accuracy_score(y_val, model.predict(X_val))
test_accuracy = accuracy_score(y_test, model.predict(X_test))

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")


NameError: name 'pd' is not defined