In [29]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [30]:
# Cell 2: Load data
df = pd.read_csv('../data/ai_human_content_detection_dataset.csv')

In [31]:
# Variables for results and evaluation
results={}

highest_score={
       'model': None,
       'score': 0
}

In [32]:
# Models 
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=100000),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier()
}

In [33]:
# X data with raw numeric-only values
X_numeric=df[['word_count', 'character_count',
       'sentence_count', 'lexical_diversity', 'avg_sentence_length',
       'avg_word_length', 'punctuation_ratio', 'flesch_reading_ease',
       'gunning_fog_index', 'grammar_errors', 'passive_voice_ratio',
       'predictability_score', 'burstiness', 'sentiment_score']]

In [34]:
# Vectorised text_content values (essay/assignment free text) with modification
vectorizer = TfidfVectorizer(  
    max_features=950,  
    ngram_range=(1, 3),  # Include bigrams and trigrams
    min_df=2,  # Ignore very rare words
    max_df=0.9,  # Ignore very common words
    sublinear_tf=True,  # Apply sublinear scaling
    use_idf=True,
    smooth_idf=True
    )  
X_text = vectorizer.fit_transform(df['text_content'])  

In [35]:
# Encoding simple one-worded content_type data to numeric values
X_cat = pd.get_dummies(df['content_type'], drop_first=True)

In [36]:
# Combining all numeric and encoded numeric values in one sparse array
X_non_text = np.hstack([X_numeric.values, X_cat.values])
X_full = hstack([X_non_text, X_text])

In [37]:
# Y value
y=df['label']

In [38]:
# Train/Test data
X_train, X_test, y_train, y_test = train_test_split(
       X_full,
       y,
       test_size=0.2,
       random_state=42,
       stratify=y
)

In [39]:
# Impute nan/missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [40]:
# Train and evaluate all models (no scaling or PCA)
for name, model in models.items():
    model.fit(X_train_imputed, y_train)
    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"RAW MODEL EVAL: {name}: {accuracy:.3f}")
best_model = max(results, key=results.get)

# Store highest score
if results[best_model] > highest_score["score"]:
       highest_score["score"] = results[best_model]
       highest_score["model"] = best_model

RAW MODEL EVAL: RandomForest: 0.511
RAW MODEL EVAL: LogisticRegression: 0.573
RAW MODEL EVAL: SVM: 0.493
RAW MODEL EVAL: KNN: 0.529


In [41]:
# Scale training and test data
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train_imputed) #fit and transform only on training data
X_test_scaled= scaler.transform(X_test_imputed)

In [42]:
# Convert sparse matrices to dense (FOR PCA)
X_train_scaled_dense = X_train_scaled.toarray()  
X_test_scaled_dense = X_test_scaled.toarray()

In [43]:
# Train and evaluate all models with scaled data
for name, model in models.items():
    model.fit(X_train_scaled_dense, y_train)
    y_pred = model.predict(X_test_scaled_dense)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"SCALED MODEL EVAL (DENSE): {name}: {accuracy:.3f}")
best_model = max(results, key=results.get)

# Store highest score
if results[best_model] > highest_score["score"]:
       highest_score["score"] = results[best_model]
       highest_score["model"] = best_model

SCALED MODEL EVAL (DENSE): RandomForest: 0.511
SCALED MODEL EVAL (DENSE): LogisticRegression: 0.526
SCALED MODEL EVAL (DENSE): SVM: 0.591
SCALED MODEL EVAL (DENSE): KNN: 0.493


In [44]:
# Apply PCA to capture 95% variance
pca=PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled_dense)
X_test_pca = pca.transform(X_test_scaled_dense)

In [45]:
# Train and evaluate all models with scaled + PCA data
for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"PCA APPLIED MODEL EVAL {name}: {accuracy:.3f}")
    
# Store highest score
if results[best_model] > highest_score["score"]:
       highest_score["score"] = results[best_model]
       highest_score["model"] = best_model
best_model = max(results, key=results.get)

PCA APPLIED MODEL EVAL RandomForest: 0.507
PCA APPLIED MODEL EVAL LogisticRegression: 0.547
PCA APPLIED MODEL EVAL SVM: 0.602
PCA APPLIED MODEL EVAL KNN: 0.507


In [46]:
# === EXPERIMENT: Optimizing LogisticRegression C parameter ===
# After finding LogisticRegression was initially best, tested different regularization strengths
# RESULT: Default C=1.0 was optimal for 100 features, but with 600 features, C=0.1 performed better
# This led to discovering that more text features (600 vs 100) was more impactful than C tuning
lr_models = {
    'LR_C=0.1': LogisticRegression(C=0.1, random_state=42, max_iter=10000),
    'LR_C=1.0': LogisticRegression(C=1.0, random_state=42, max_iter=10000), 
    'LR_C=10.0': LogisticRegression(C=10.0, random_state=42, max_iter=10000),
}

for name, model in lr_models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

In [47]:
print(f"Best RAW Model: {best_model} with accuracy: {results[best_model]:.3f}")

Best RAW Model: SVM with accuracy: 0.602


In [48]:
# THOUGHT PROCESS AND EVAL:
# Grid search with cross-validation and multiple C values worsened score and same with linearSVC
# optimised second best model (lr) with best .54 lower than svm
# optimised nbm model scored second best optimised at 5.8

In [None]:
#LEFT TO TRY
#pca variance tuning
#only options: stick to 60% with svm AND test with only numeric values
# use nbm model (both: convert to positive and try with numeric only)