In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score



In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------------- -------------- 8.1/12.8 MB 41.8 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 35.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 30.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
df = pd.read_csv("processed_dataset.csv", low_memory=False)
df = df[['clean_post', 'subreddit']].dropna()

nlp = spacy.load("en_core_web_sm")

In [6]:
def extract_named_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]  
    return " ".join(entities)  

In [8]:
df_sample = df[:10]

In [None]:
entities_list = df_sample['clean_post'].apply(extract_named_entities).tolist()
entities_list


['',
 'johns google yesterday first ps several thousand',
 '9194049233 15',
 '',
 'grad school physics branching three months 5',
 '',
 'können einem bedarf vorfeld gedanken über zu messen',
 '7k 1k day',
 '',
 'ga 3 days google google one']

In [12]:
chunk_size = 1000  
num_chunks = len(df) // chunk_size + 1

all_entities = []

for i in range(num_chunks):
    print(f"Procesando bloque {i+1} de {num_chunks}")
    df_chunk = df[i * chunk_size:(i + 1) * chunk_size]
    entities_chunk = df_chunk['clean_post'].apply(extract_named_entities).tolist()
    all_entities.extend(entities_chunk)  

Procesando bloque 1 de 273
Procesando bloque 2 de 273
Procesando bloque 3 de 273
Procesando bloque 4 de 273
Procesando bloque 5 de 273
Procesando bloque 6 de 273
Procesando bloque 7 de 273
Procesando bloque 8 de 273
Procesando bloque 9 de 273
Procesando bloque 10 de 273
Procesando bloque 11 de 273
Procesando bloque 12 de 273
Procesando bloque 13 de 273
Procesando bloque 14 de 273
Procesando bloque 15 de 273
Procesando bloque 16 de 273
Procesando bloque 17 de 273
Procesando bloque 18 de 273
Procesando bloque 19 de 273
Procesando bloque 20 de 273
Procesando bloque 21 de 273
Procesando bloque 22 de 273
Procesando bloque 23 de 273
Procesando bloque 24 de 273
Procesando bloque 25 de 273
Procesando bloque 26 de 273
Procesando bloque 27 de 273
Procesando bloque 28 de 273
Procesando bloque 29 de 273
Procesando bloque 30 de 273
Procesando bloque 31 de 273
Procesando bloque 32 de 273
Procesando bloque 33 de 273
Procesando bloque 34 de 273
Procesando bloque 35 de 273
Procesando bloque 36 de 273
P

In [14]:
all_entities

['',
 'johns google yesterday first ps several thousand',
 '9194049233 15',
 '',
 'grad school physics branching three months 5',
 '',
 'können einem bedarf vorfeld gedanken über zu messen',
 '7k 1k day',
 '',
 'ga 3 days google google one',
 'zero',
 '',
 '',
 'google one',
 'russian spanish',
 'spss',
 '',
 '',
 '2009',
 'first three month 7609 month march 2011 104716 2010 89517 1698 april 2011 60989 2010 60188 133 may 2011 99639 2010 1340',
 'google',
 'one one ga second',
 '',
 'first 1 2 3',
 '',
 'first 3',
 'ga one',
 '',
 '',
 '8 one 1 one 2 3 4 zero',
 '',
 '',
 'ga ga',
 '',
 'joseph tennessee one',
 'ga',
 '',
 'two',
 '',
 '14k 301',
 'ga 301 301 third',
 'yahoo microsoft google google yahoo 1 2 yahoo cpc yahoo 3 microsoft google 1 2 yahoo cpc microsoft camplign 3 microsoft',
 '',
 'one',
 'cms',
 'every month google',
 'bitchen wolframalpha 09 064757 2012',
 'today 79 35 48 4554',
 '',
 'first year ibm',
 '6 week two 1577366 9371 178 9371 1577366 2958 019 ga ga 20000 1 2 3

In [15]:
vectorizer = CountVectorizer(max_features=5000)  
X = vectorizer.fit_transform(all_entities)

In [16]:
y = df['subreddit']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

In [18]:
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-Score (Weighted): {f1}")

print("\nClassification Report:\n", classification_report(y_test, y_pred))

F1-Score (Weighted): 0.18211966719430864

Classification Report:
                       precision    recall  f1-score   support

       AskStatistics       0.27      0.28      0.28      9056
     DataScienceJobs       0.49      0.29      0.36       688
         MLQuestions       0.08      0.02      0.03      3410
     MachineLearning       0.23      0.19      0.21     11223
           analytics       0.34      0.18      0.24      2349
          artificial       0.12      0.03      0.04      2621
     computerscience       0.26      0.13      0.17      6712
      computervision       0.23      0.10      0.14      2925
                data       0.52      0.21      0.30       799
        dataanalysis       0.07      0.02      0.03      1214
     dataengineering       0.26      0.07      0.11      2467
         datascience       0.17      0.59      0.26     11168
  datascienceproject       0.06      0.03      0.04        76
            datasets       0.21      0.08      0.12      3442
   