In [22]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import LinearSVC
import os
import joblib
import gradio as gr
from sklearn.metrics import classification_report
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from fastapi import FastAPI
from pydantic import BaseModel


## Investigate the Dataset

In [2]:

# Load dataset
df = pd.read_csv("../data/task.csv")

# Display basic info
print("First 5 rows:")
print(df.head())

print("\nDataFrame Info:")
print(df.info())

print("\nLabel Distribution:")
print(df['Tag'].value_counts())



First 5 rows:
   Unnamed: 0                    Title  \
0           0  I tre volti della paura   
1           1        Mitt liv som hund   
2           2                The Brood   
3           3              The Haunted   
4           4        The Frozen Ground   

                                            Synopsis         Tag  
0  Note: this synopsis is for the orginal Italian...        cult  
1  The action takes place in the years 1958-1959 ...        cult  
2  At the Somafree Institute, Dr. Hal Raglan humi...        cult  
3  This creepy and scary story centers around The...  paranormal  
4  The film opens in an Anchorage motel room in 1...    dramatic  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1566 entries, 0 to 1565
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1566 non-null   int64 
 1   Title       1566 non-null   object
 2   Synopsis    1566 non-null   object
 3   Tag

## Preprocess the Data

In [25]:


# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation and special characters
    return text

# Apply cleaning to the 'Synopsis' column
df['clean_synopsis'] = df['Synopsis'].apply(clean_text)

# Encode the 'Tag' column into numerical labels
le = LabelEncoder()

df['tag_encoded'] = le.fit_transform(df['Tag'])

if not os.path.exists('../model'):
    os.makedirs('../model')
joblib.dump(le, '../model/le.pkl')



['../model/le.pkl']

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Synopsis,Tag,clean_synopsis,tag_encoded
0,0,I tre volti della paura,Note: this synopsis is for the orginal Italian...,cult,note this synopsis is for the orginal italian ...,0
1,1,Mitt liv som hund,The action takes place in the years 1958-1959 ...,cult,the action takes place in the years 19581959 i...,0
2,2,The Brood,"At the Somafree Institute, Dr. Hal Raglan humi...",cult,at the somafree institute dr hal raglan humili...,0
3,3,The Haunted,This creepy and scary story centers around The...,paranormal,this creepy and scary story centers around the...,2
4,4,The Frozen Ground,The film opens in an Anchorage motel room in 1...,dramatic,the film opens in an anchorage motel room in 1...,1


## Train-Test Split

In [27]:


X = df['clean_synopsis']
y = df['tag_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


## Vectorize Text (TF-IDF)

In [28]:
max_length = X.apply(lambda x: len(x.split())).max()
print("Maximum length (in words) of the texts:", max_length)

Maximum length (in words) of the texts: 8473


In [29]:


text_lengths = X.apply(lambda x: len(x.split()))
fig = px.histogram(text_lengths, nbins=50, title="Distribution of Text Lengths (in words)", labels={'value': 'Text Length (words)', 'count': 'Frequency'})
fig.show()

In [8]:
text_lengths.quantile(0.99)

np.float64(4731.099999999999)

In [30]:

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [31]:
# Save the vectorizer
if not os.path.exists('../model'):
    os.makedirs('../model')
joblib.dump(vectorizer, '../model/vectorizer.pkl')

['../model/vectorizer.pkl']

## Train a Classifier

In [18]:


# Define base models
logreg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
svc = LinearSVC(max_iter=2000, random_state=42)

# Fit base models
# logreg.fit(X_train_vec, y_train)
rf.fit(X_train_vec, y_train)
gb.fit(X_train_vec, y_train)
svc.fit(X_train_vec, y_train)

# Ensemble with soft voting (except LinearSVC, which does not support predict_proba, so use hard voting)
ensemble = VotingClassifier(
    estimators=[
        # ('logreg', logreg),
        ('rf', rf),
        ('gb', gb),
        ('svc', svc)
    ],
    voting='hard'
)
ensemble.fit(X_train_vec, y_train)





### Evaluate the Model

In [19]:

y_pred = ensemble.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=le.classes_))
# Decode labels to original class names
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)

print(classification_report(y_test_labels, y_pred_labels))


              precision    recall  f1-score   support

        cult       0.72      0.97      0.83       207
    dramatic       1.00      0.03      0.06        34
  paranormal       0.82      0.38      0.52        73

    accuracy                           0.73       314
   macro avg       0.85      0.46      0.47       314
weighted avg       0.77      0.73      0.67       314

              precision    recall  f1-score   support

        cult       0.72      0.97      0.83       207
    dramatic       1.00      0.03      0.06        34
  paranormal       0.82      0.38      0.52        73

    accuracy                           0.73       314
   macro avg       0.85      0.46      0.47       314
weighted avg       0.77      0.73      0.67       314



In [20]:

# Ensure the models directory exists
os.makedirs('../model', exist_ok=True)

# Save base models
# joblib.dump(logreg, '../model/logreg_model.pkl')
joblib.dump(rf, '../model/rf_model.pkl')
joblib.dump(gb, '../model/gb_model.pkl')
joblib.dump(svc, '../model/svc_model.pkl')
# Save ensemble model
joblib.dump(ensemble, '../model/ensemble_model.pkl')

['../models/ensemble_model.pkl']

## Build a Demo App (Gradio)

In [10]:

# lodad joblib
vectorizer = joblib.load('../model/vectorizer.pkl')
ensemble = joblib.load('../model/ensemble_model.pkl')
model = ensemble
def predict(text):
    cleaned = clean_text(text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)
    return le.inverse_transform(pred)[0]

demo = gr.Interface(fn=predict, inputs="textbox", outputs="label", title="Text Classifier", description="Classifies into: cult, paranormal, dramatic")
demo.launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Using existing dataset file at: .gradio\flagged\dataset1.csv


In [34]:
X_test[125]



In [None]:


app = FastAPI()

class InputText(BaseModel):
    text: str

@app.post("/predict")
def get_prediction(input: InputText):
    cleaned = clean_text(input.text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)
    label = le.inverse_transform(pred)[0]
    return {"prediction": label}
