In [1]:
# Importing All Required Libraries

# Libraries For Functions

import re
import statistics
from textstat import flesch_reading_ease, lexicon_count
import string
import re
from collections import Counter
from spellchecker import SpellChecker

# Libraries For Model and Pipeline

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Reading Data

df = pd.read_csv('Huggingface(3k).csv')
df = df.drop('Unnamed: 0', axis = 1)

In [3]:
df.head(1)

Unnamed: 0,text,labels
0,"Adventure, that's what it promised, and advent...",1


In [4]:
# Define Functions to extract features from given data set.add

# 1 Lenght Variance
def length_variance(X):
    return [[
        statistics.variance(lengths) if len(lengths) > 1 else 0
    ] for x in X.ravel()
      for lengths in [[len(s.split()) for s in re.split(r'[;.!?]', x) if s.strip()]]
    ]

# 2 Readiability Score
def readability_score(X):
    return [[flesch_reading_ease(x)] for x in X.ravel()]

# 3 Remove punctuatins
def remove_punctuation(X):
    return [[len(x.translate(str.maketrans('', '', string.punctuation)).split())] for x in X.ravel()]

# 4 Repeatition Score
def repetition_score(X):
    return [[
        sum(1 for count in Counter(x.split()).values() if count > 1) / len(x.split()) * 100
    ] for x in X.ravel()]

# 5 Creativity Score
def creativity_score(X):
    return [[
        len(set(x.split())) / len(x.split()) * 100
    ] for x in X.ravel()]

# 6 Typo Mistake Count in percentage
def typo_count(X):
    spell = SpellChecker()
    return [[
        (len(spell.unknown(x.split())) / len(x.split())) * 100 if len(x.split()) > 0 else 0
    ] for x in X.ravel()]





In [5]:
#  Wrap them in FunctionTransformers

feature_union = FeatureUnion([
    ('length_var', FunctionTransformer(length_variance, validate=False)),
    ('readability', FunctionTransformer(readability_score, validate=False)),
    ('punct_clean', FunctionTransformer(remove_punctuation, validate=False)),
    ('repeatition', FunctionTransformer(repetition_score, validate=False)),
    ('creativity_scr', FunctionTransformer(creativity_score, validate=False)),
    ('typos', FunctionTransformer(typo_count, validate=False)),
])

In [6]:
# Combine TFIDF With Other Custom Features

preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(), 0),
        ('custom', feature_union, 0)
    ]
)

In [7]:
# Define Pipeline with All Features + RandomForest Classifier

pipeline = Pipeline(steps=[
    ('features', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))  # or use GaussianNB(), etc.
])

In [8]:
# Separate Out Features and Labels

X = df[['text']].values   # Data as a 2D array
y = df['labels']

#Apply Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
# Fit Training Data in Pipeline

pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
# Calculate X_test
y_pred = pipeline.predict(X_test)



In [11]:
# Evaluate Matrix 


print(classification_report(y_test, y_pred))
print()
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       439
           1       0.98      0.98      0.98       461

    accuracy                           0.98       900
   macro avg       0.98      0.98      0.98       900
weighted avg       0.98      0.98      0.98       900


[[432   7]
 [  8 453]]


In [12]:
# Testing External Inputs

text_s = """A galaxy is a massive cosmic system composed of billions to trillions of stars, along with gas, dust, dark matter, and other celestial objects, all bound together by gravity. Galaxies come in various shapes—primarily spiral, elliptical, and irregular—with each type revealing unique characteristics about their formation and evolution. For example, spiral galaxies like the Milky Way feature rotating disks with distinct arms, while elliptical galaxies tend to be more uniform and contain older stars. At their cores, many galaxies harbor supermassive black holes that influence their dynamics. Galaxies often exist in clusters and interact through collisions or mergers, driving star formation and structural changes. Studying galaxies is crucial for understanding the large-scale structure of the universe, the nature of dark matter, and the cosmic history of matter and energy distribution.
"""# AI
text_m = """The Fibonacci series is a sequence of numbers in which each number is the sum of the two preceding ones, typically starting with 0 and 1. The series follows the recursive formula **F(n) = F(n−1) + F(n−2)**, with initial conditions **F(0) = 0** and **F(1) = 1**. This sequence appears in various natural patterns, such as the arrangement of leaves or the spiral of shells, and has applications in mathematics, computer science, and even financial modeling. Theoretically, it illustrates concepts of recursion, growth patterns, and the golden ratio, which the ratio of successive Fibonacci numbers approaches as the series progresses.
"""# AI
text_sh = """The Universe is filled with galaxies. Each Galaxy is billions upon billions of Stars. Our Galaxy is not huge by galactic standards. It is estimated that there are 100 billion stars in our Galaxy! When you look at the Stars at night you are looking at the Stars in our Galaxy. All the stars you see are in our Galaxy. Galaxies come in a number of shapes and sizes. Even a small Galaxy is MASSIVE. Our Galaxy is called “The Milky Way”, it is a pinwheel Galaxy. The next nearest galaxy is Andromeda. It is 2.537 million light years away! Andromeda has 1000 billion stars!

Our Sun lies about 2/3 the way out on one of the legs of the Galaxy. The whole thing is turning and we and our Solar System are orbiting the center of our Galaxy.
""" # Human

text_gh = """These answers are outdated and incorrect. All employees enjoy standby travel to any destination American Airlines serves. If you are the employee, spouse or children there are no charges for travel, Except (international) destinations where taxes apply. 24 hours before departure you are obligated to check in by computer or phone and this places you on the standby list. First come, first serve. You must do your homework before traveling-- if the flight is overbooked don't bother! We can also purchase full tickets off the AA website with a 20% discount of any fare. These "AA20" tickets are considered full fare and you are treated the same as a full fare passenger including assigned seating"""
# human

In [21]:
# Test Case
test_1 = "Climate change is a pressing issue. It is caused by greenhouse gases. In addition, it leads to rising sea levels. In conclusion, we must act swiftly to mitigate its effects."
#(AI)
test_2 = "Quantum computing is the future. It solves problems faster than classical computers. For instance, it can optimize logistics. However, challenges remain in scalability."
#(AI)
test_3 = "Had an amazing hike today! The sunset was unreal, totally worth the climb. #NatureLover"
#(Human)

In [12]:
text = """This is extremely related to my previous point. We tend to be so caught up in our daily lives that sometimes by simply sticking around we may cause more harm than good. Your boss is taking over your life? Kids are driving you mad? Your parents are trying to make you live the life they want? How long do you think you can handle this pressure before you burst and everything falls apart?

Sometimes it is best to take a step back, take a deep breath and take go that Tower Bridge selfie. In all seriousness, travel is not a bad option - it is the most natural way of inducing the feeling you miss someone or that you are missed. The trick is to leave with a bit of preparation to avoid making a mistake during your journey"""

In [13]:
x = [[text]]
print('Human' if pipeline.predict(x) == 0 else 'AI')
print(max(pipeline.predict_proba(x)[0]))

Human
0.99983746


In [14]:
# Creating pkl file model

import joblib
joblib.dump(pipeline,'XGBoost_Text_Classifier.pkl')

['XGBoost_Text_Classifier.pkl']