In [None]:
!pip install scikit-optimize



# New Section

In [None]:
# Mount Google Drive at /content/drive; the dataset path used later lives under that mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import required libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# read dataset function
def read_dataset(inFile):
    """Load a tab-separated file into a pandas DataFrame.

    The location being read is printed before the file is parsed.

    Args:
        inFile: Location of the file; passed unchanged to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    print("\nReading:", inFile)
    return pd.read_csv(inFile, sep="\t")

In [None]:
# Dataset location and experiment settings shared by the cells below.
inTrain = "/content/drive/MyDrive/dataset.csv"

max_instances_per_class = 1_500  # rows kept per label when the training split is downsampled
max_features = 20_000  # upper bound on the number of features the vectorizer extracts
random_seed = 777  # one seed for splitting and sampling, for reproducibility
id2label = {
    0: "HUMANO",
    1: "IA",
}  # encoded class index -> display name used in the report

In [None]:
# Load the whole dataset, then hold out 20% of it as the test split.
# The split is stratified on the "Label" column and seeded with random_seed.
full_df = read_dataset(inTrain)
train_df, test_df = train_test_split(
    full_df,
    test_size=0.2,
    stratify=full_df['Label'],
    random_state=random_seed,
)


Reading: /content/drive/MyDrive/dataset.csv


In [None]:
from itertools import count  # NOTE(review): not referenced in the visible cells — confirm before removing
# Downsample the training data to the same number of rows per label so training is faster.
# Sampling without replacement raises ValueError when a class has fewer rows than requested,
# so the per-class size is capped at the size of the smallest class.
instances_per_class = min(max_instances_per_class, train_df["Label"].value_counts().min())
train_df = train_df.groupby("Label").sample(n=instances_per_class, random_state=random_seed)

In [None]:
# Count the training rows per label and in total.
instancias_humanas = int((train_df['Label'] == 'HUMANO').sum())
instancias_ia = int((train_df['Label'] == 'IA').sum())
instancias_dataset = len(train_df)

In [None]:
# For each label: add up the character lengths of its texts, then divide by the
# number of rows with that label to get the mean length.
es_humano = train_df['Label'] == 'HUMANO'
suma_longitudes_humanos = train_df.loc[es_humano, 'Text'].apply(len).sum()
longitud_media_humanos = suma_longitudes_humanos / instancias_humanas

es_generado = train_df['Label'] == 'IA'
suma_longitudes_generados = train_df.loc[es_generado, 'Text'].apply(len).sum()
longitud_media_generado = suma_longitudes_generados / instancias_ia

In [None]:
# Print the statistics table; the trailing tabs in each caption line up the value column.
estadisticas = (
    ('Número de instancias en el dataset:\t\t\t\t', instancias_dataset),
    ('Número de instancias humanas:\t\t\t\t\t', instancias_humanas),
    ('Número de instancias generadas:\t\t\t\t\t', instancias_ia),
    ('Longitud media en caracteres de las instancias humanas:\t\t', longitud_media_humanos),
    ('Longitud media en caracteres de las instancias generadas:\t', longitud_media_generado),
)
for etiqueta, valor in estadisticas:
    print(etiqueta, valor)

Número de instancias en el dataset:				 3000
Número de instancias humanas:					 1500
Número de instancias generadas:					 1500
Longitud media en caracteres de las instancias humanas:		 489.0926666666667
Longitud media en caracteres de las instancias generadas:	 1308.508


In [None]:
# Turn the raw texts into TF-IDF feature vectors.
# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that already-fitted vectorizer.
# NOTE(review): stop_words="english" assumes the texts are in English — confirm against the dataset.
vectorizer = TfidfVectorizer(
    max_features=max_features,
    stop_words="english",
    ngram_range=(1, 1),  # single tokens only
)
X_train = vectorizer.fit_transform(train_df["Text"])
X_test = vectorizer.transform(test_df["Text"])
# Debug aid: dump the fitted vocabulary ordered by feature index, highest first.
#print({k: v for k, v in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1], reverse=True)})

In [None]:
# Encode the string labels as integers.
# The encoder is fitted on the training labels and reused as-is for the test labels.
# NOTE(review): id2label presumes HUMANO -> 0 and IA -> 1; verify against le.classes_.
le = LabelEncoder()
train_labels, test_labels = train_df["Label"], test_df["Label"]
Y_train = le.fit_transform(train_labels)
Y_test = le.transform(test_labels)

In [None]:
# create model
# Seed the randomized tree ensemble with the notebook-wide seed so repeated runs
# build the same trees and report the same scores.
model = ExtraTreesClassifier(random_state=random_seed)

In [None]:
# train model
# Fit the classifier on the vectorized training texts and their encoded labels.
model.fit(X_train, Y_train)

In [None]:
# Print the size of each split and its per-label breakdown.
resumen_splits = (
    ('Número de instancias en el training:\t\t', len(train_df)),
    ('Número de instancias en el test:\t\t', len(test_df)),
    ('Número de instancias humanas en el training:\t', len(train_df[train_df['Label'] == 'HUMANO'])),
    ('Número de instancias generadas en el training:\t', len(train_df[train_df['Label'] == 'IA'])),
    ('Número de instancias generadas en el test:\t', len(test_df[test_df['Label'] == 'IA'])),
    ('Número de instancias humanas en el test:\t', len(test_df[test_df['Label'] == 'HUMANO'])),
)
for etiqueta, valor in resumen_splits:
    print(etiqueta, valor)

Número de instancias en el training:		 3000
Número de instancias en el test:		 860
Número de instancias humanas en el training:	 1500
Número de instancias generadas en el training:	 1500
Número de instancias generadas en el test:	 466
Número de instancias humanas en el test:	 394


In [None]:
# get test predictions
# Predict a label for every row of the vectorized test split.
predictions = model.predict(X_test)

In [None]:
# evaluate predictions
# Display names for the report, taken from id2label in its insertion order.
target_names = list(id2label.values())
print(classification_report(Y_test, predictions, target_names=target_names))


              precision    recall  f1-score   support

      HUMANO       0.86      0.91      0.88       394
          IA       0.92      0.87      0.90       466

    accuracy                           0.89       860
   macro avg       0.89      0.89      0.89       860
weighted avg       0.89      0.89      0.89       860



In [None]:
# classify your own text
custom_texts = ["I'm ChatGPT, your virtual assistant, and I'm generating texts"]
# Reuse the already-fitted vectorizer so the custom text gets the same features as the training data.
X_custom = vectorizer.transform(custom_texts)
preds = model.predict(X_custom)
# Map the predicted class index of the first (only) text to its display name.
predicted_label = target_names[preds[0]]
print("Classification label:", predicted_label)

Classification label: HUMANO


In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import all_estimators
from sklearn.base import ClassifierMixin
from sklearn.metrics import f1_score

# Benchmark every scikit-learn classifier with default hyperparameters and rank
# them by macro F1.
# NOTE(review): candidates are scored on the same test split used for the final
# report, so this ranking selects on test data; consider a validation split or
# cross-validation on the training data instead.

# Silence all convergence warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)

best_score = float('-inf')
best_model = None
top_modelos = []

print('Calculando el Mejor Modelo...')
for name, ClassifierClass in all_estimators(type_filter='classifier'):
    if not (issubclass(ClassifierClass, ClassifierMixin) and hasattr(ClassifierClass, 'fit')):
        continue
    try:
        classifier = ClassifierClass()
        # Seed every estimator that exposes random_state so the ranking is repeatable.
        if 'random_state' in classifier.get_params():
            classifier.set_params(random_state=random_seed)
        classifier.fit(X_train, Y_train)
        y_pred = classifier.predict(X_test)
        score = f1_score(Y_test, y_pred, average="macro")
    except Exception as error:
        # Estimators that cannot be built, fitted or scored with defaults on this
        # data are skipped; report which one and why instead of an anonymous dot.
        print('. {} omitido ({})'.format(name, type(error).__name__))
        continue
    top_modelos.append((score, name, classifier))
    if score > best_score:
        best_score = score
        best_model = classifier

# Sort the models from highest to lowest score
top_modelos.sort(reverse=True, key=lambda x: x[0])

# Keep the five best models
top_five = top_modelos[:5]

# Table header
print('\n{:_<50}'.format(""))
print("\n{:^50}".format("--- TOP 5 MEJORES MODELOS ---"))
print('{:_<50}'.format(""))
print("\n{:^5} | {:^25} | {:^15}".format("TOP", "MODELO", "PUNTUACIÓN"))
print('{:_<50}\n'.format(""))

# Print the top five. The loop variable is deliberately not called `model`, so the
# classifier fitted in the earlier cell is not overwritten by this report.
for i, (score, name, candidate) in enumerate(top_five, start=1):
    recommended = "<- Modelo Recomendado" if candidate is best_model else ""
    print("{:^5} | {:^25} | {:^13.6f} | {}".format(i, name, score, recommended))

Calculando el Mejor Modelo...
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.

__________________________________________________

          --- TOP 5 MEJORES MODELOS ---           
__________________________________________________

 TOP  |          MODELO           |   PUNTUACIÓN   
__________________________________________________

  1   |   ExtraTreesClassifier    |   0.891564    | <- Modelo Recomendado
  2   |       MLPClassifier       |   0.878014    | 
  3   |      RidgeClassifier      |   0.877541    | 
  4   |     RidgeClassifierCV     |   0.877541    | 
  5   |  CalibratedClassifierCV   |   0.877441    | 
