In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF, metrics
from sklearn_crfsuite import scorers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

def load_data(file_path):
    """Read the comment dataset from a CSV file.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of a
            CSV file containing ``CONTENT`` and ``CLASS`` columns.

    Returns:
        A ``(texts, labels)`` tuple holding the ``CONTENT`` column and the
        ``CLASS`` column of the loaded frame, in that order.

    Raises:
        KeyError: If either expected column is absent from the file.
    """
    frame = pd.read_csv(file_path)
    texts = frame['CONTENT']
    labels = frame['CLASS']
    return texts, labels

def word2features(word):
    """Build the feature dictionary describing a single token.

    Args:
        word: The token to describe.

    Returns:
        A dict with a constant ``bias`` of 1.0, the lower-cased token, its
        last three and last two characters, and three boolean flags taken
        from ``isupper()``, ``istitle()`` and ``isdigit()``.
    """
    feats = {'bias': 1.0, 'word.lower()': word.lower()}

    # Trailing character slices; shorter tokens simply yield the whole token.
    feats['word[-3:]'] = word[-3:]
    feats['word[-2:]'] = word[-2:]

    # Casing / digit shape flags.
    feats.update({
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    })
    return feats

def prepare_features(X):
    """Convert sentences into per-token feature dictionaries.

    Args:
        X: An iterable of sentence strings.

    Returns:
        A list with one entry per sentence; each entry is the list of
        ``word2features`` dicts for the whitespace-separated tokens of that
        sentence, in order.
    """
    featurized = []
    for sentence in X:
        tokens = sentence.split()
        featurized.append(list(map(word2features, tokens)))
    return featurized

def train_crf_classifier(X, y):
    """Train and evaluate a TF-IDF + Multinomial Naive Bayes text classifier.

    Despite the name, no CRF is fitted here. A CRF expects a label for every
    token of every sequence, while this data carries a single label per
    comment, so a bag-of-words classifier is used instead. The function name
    is kept so existing callers continue to work.

    The data is split 80/20 with a fixed ``random_state`` of 42, the
    pipeline is fitted on the training part, and a classification report
    for the held-out part is printed to stdout.

    Args:
        X: Raw comment texts, one per sample.
        y: Class labels aligned with ``X``.

    Returns:
        The fitted pipeline (``TfidfVectorizer`` followed by
        ``MultinomialNB``). It is trained on the 80% training split only.
    """
    # NOTE: the per-token CRF features and the wrapped label lists that used
    # to be built here were never consumed, so they are no longer computed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return model

def predict_class(model, comment):
    """Return the class predicted by ``model`` for a single comment.

    Args:
        model: Object exposing ``predict`` that accepts a list of documents.
        comment: The text to classify.

    Returns:
        The first (and only) element of the prediction for ``[comment]``.
    """
    # The model works on batches, so wrap the comment in a one-item list.
    predictions = model.predict([comment])
    return predictions[0]

if __name__ == "__main__":
    # Script entry point: load the dataset, train and evaluate the
    # classifier, then classify one comment typed in by the user.
    file_path = "Youtube01-Psy.csv"  # Relative path; update it to where the CSV actually lives
    X, y = load_data(file_path)
    # Training also prints a classification report for the held-out split.
    model = train_crf_classifier(X, y)

    # Read a single comment from standard input and show its predicted class.
    user_comment = input("Enter a comment: ")
    predicted_class = predict_class(model, user_comment)
    print(f"Predicted Class: {predicted_class}")
    


Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95        27
           1       0.98      0.95      0.96        43

    accuracy                           0.96        70
   macro avg       0.95      0.96      0.96        70
weighted avg       0.96      0.96      0.96        70



Enter a comment:  this is so good


Predicted Class: 0
