# Import required libraries and functions

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv('preprocessed_data.csv')

In [4]:
print(data)

                                                     text  generated  \
0       cars. cars have been around since they became ...        0.0   
1       transportation is a large necessity in most co...        0.0   
2       "america's love affair with it's vehicles seem...        0.0   
3       how often do you ride in a car? do you drive a...        0.0   
4       cars are a wonderful thing. they are perhaps o...        0.0   
...                                                   ...        ...   
136377  some schools require students to complete summ...        0.0   
136378   if you could visit tne place in the world, wh...        1.0   
136379   in my opinion, emerson's statement that every...        1.0   
136380   tht challenge of exploring venus \n\nby woodr...        1.0   
136381  if we want driverless cars so bad why don't we...        0.0   

                                           tokenized_text  \
0       ['cars', '.', 'cars', 'have', 'been', 'around'...   
1       ['tra

In [5]:
X = data['filtered_text']
y = data['generated']

In [6]:
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [7]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val_test)

In [11]:
logreg = LogisticRegression(max_iter=1000, penalty='l2')
logreg.fit(X_train_vectorized, y_train)

In [12]:
scores = cross_val_score(logreg, X_train_vectorized, y_train, cv=5)

In [13]:
y_val_pred = logreg.predict(X_val_vectorized)
accuracy = accuracy_score(y_val_test, y_val_pred)
print("Validation accuracy: ", accuracy)
print("Classification Report: ")
print(classification_report(y_val_test, y_val_pred))
print(scores)

Validation accuracy:  0.9958939766103311
Classification Report: 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     15223
         1.0       1.00      1.00      1.00     12054

    accuracy                           1.00     27277
   macro avg       1.00      1.00      1.00     27277
weighted avg       1.00      1.00      1.00     27277

[0.99518812 0.99541726 0.99592136 0.99509647 0.99500481]
