# NUMERIC-ONLY FEATURE

Quick experiment to test if the numeric-only features available can distinguish AI vs Human writing

**Result**: ~55% accuracy with numeric-only feature approach vs 60.2% with full features

**Conclusion**: The full features approach provides meaningful improvement over numeric-only feature approach

*Note: This was an exploratory experiment. See main.py for the complete, production-ready pipeline.*

In [None]:
# imports 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [5]:
# Load data
df = pd.read_csv('../data/ai_human_content_detection_dataset.csv')

In [6]:
# Variables for storing and evaluating 
results={}

highest_score={
       'model': None,
       'score': 0
}

In [7]:
# Models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=100000),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier()
}

In [8]:
# Only numeric values from original data
X_numeric=df[['word_count', 'character_count',
       'sentence_count', 'lexical_diversity', 'avg_sentence_length',
       'avg_word_length', 'punctuation_ratio', 'flesch_reading_ease',
       'gunning_fog_index', 'grammar_errors', 'passive_voice_ratio',
       'predictability_score', 'burstiness', 'sentiment_score']]

In [9]:
# Desired output
y=df['label']

In [10]:
# Split training/test
X_train, X_test, y_train, y_test = train_test_split(
       X_numeric,
       y,
       test_size=0.2,
       random_state=42,
       stratify=y
)

In [11]:
# Impute
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [12]:
# Train and eval raw model
for name, model in models.items():
    model.fit(X_train_imputed, y_train)
    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"RAW MODEL EVAL: {name}: {accuracy:.3f}")
best_model = max(results, key=results.get)
if results[best_model] > highest_score["score"]:
       highest_score["score"] = results[best_model]
       highest_score["model"] = best_model
print("")
print(f"Best RAW Model: {best_model} with accuracy: {results[best_model]:.3f}")

RAW MODEL EVAL: RandomForest: 0.504
RAW MODEL EVAL: LogisticRegression: 0.522
RAW MODEL EVAL: SVM: 0.493
RAW MODEL EVAL: KNN: 0.529

Best RAW Model: KNN with accuracy: 0.529


In [13]:
# Scale
scaler = StandardScaler(with_mean=False)  
X_train_scaled = scaler.fit_transform(X_train_imputed) 
X_test_scaled= scaler.transform(X_test_imputed)

In [14]:
# Train and evaluate all models with scaled data
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"SCALED MODEL EVAL (DENSE): {name}: {accuracy:.3f}")
best_model = max(results, key=results.get)
if results[best_model] > highest_score["score"]:
       highest_score["score"] = results[best_model]
       highest_score["model"] = best_model
print(f"Best SCALED Model: {best_model} with accuracy: {results[best_model]:.3f}")

SCALED MODEL EVAL (DENSE): RandomForest: 0.504
SCALED MODEL EVAL (DENSE): LogisticRegression: 0.533
SCALED MODEL EVAL (DENSE): SVM: 0.500
SCALED MODEL EVAL (DENSE): KNN: 0.551
Best SCALED Model: KNN with accuracy: 0.551


In [15]:
# Apply PCA to capture 95% variance
pca=PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [16]:
# Train and evaluate all models with scaled + PCA data
for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"PCA APPLIED MODEL EVAL {name}: {accuracy:.3f}")
if results[best_model] > highest_score["score"]:
       highest_score["score"] = results[best_model]
       highest_score["model"] = best_model
best_model = max(results, key=results.get)
print(f"Best PCA Model: {best_model} with accuracy: {results[best_model]:.3f}")

PCA APPLIED MODEL EVAL RandomForest: 0.555
PCA APPLIED MODEL EVAL LogisticRegression: 0.500
PCA APPLIED MODEL EVAL SVM: 0.507
PCA APPLIED MODEL EVAL KNN: 0.536
Best PCA Model: RandomForest with accuracy: 0.555


In [17]:
print(f"BEST SCORING MODEL: {highest_score['model']} with accuracy: {highest_score['score']:.3f}")

BEST SCORING MODEL: KNN with accuracy: 0.551


In [None]:
# performed worse without text content 