In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd

In [None]:
# Load the text data; later cells read the columns episode_number, guest_name,
# context_episode and episode_split_text from this frame.
sds_text = pd.read_csv('../data/sds_text.csv')

In [None]:
# Display the loaded frame for a quick visual check.
sds_text

In [None]:
# Merge the text chunks that share the same episode_number, guest_name and
# context_episode into one space-separated string, one row per group.
sds_combined = (
    sds_text
    .groupby(['episode_number', 'guest_name', 'context_episode'])['episode_split_text']
    .agg(' '.join)
    .reset_index()
)

In [None]:
# Preview the first rows of the per-group combined text.
sds_combined.head()

In [None]:
# Load the spaCy "en_core_web_sm" pipeline; preprocess_text calls it on every text.
nlp = spacy.load("en_core_web_sm")
# Materialise spaCy's English stop-word set as a list.
# NOTE(review): not referenced in the cells shown here (preprocess_text relies
# on token.is_stop instead) — confirm whether it is still needed.
stopwords = list(STOP_WORDS)

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` with spaCy and return its cleaned lemmas.

    Stop words, digit tokens, punctuation and whitespace tokens are dropped;
    every remaining token contributes its lemma, lower-cased and stripped of
    surrounding whitespace.

    Returns:
        list of str, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip tokens that should not become vocabulary entries.
        if tok.is_stop or tok.is_digit or tok.is_punct or tok.is_space:
            continue
        lemmas.append(tok.lemma_.lower().strip())
    return lemmas

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Custom transformer that normalises raw text before vectorization
class predictors(TransformerMixin):
    """Stateless pipeline step that strips and lower-cases every input text.

    spaCy is not used in this class; it only applies ``clean_text`` to each
    element it is given.
    """

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text`` applied to each item of ``X``."""
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` unchanged.

        ``y`` defaults to ``None`` so the step can also be fitted without
        labels, e.g. via ``fit_transform(X)``, instead of raising a
        ``TypeError`` for the missing argument.
        """
        return self

    def get_params(self, deep=True):
        """Report no hyper-parameters (the step has none)."""
        return {}

# Basic text normalisation helper
def clean_text(text):
    """Return ``text`` without surrounding whitespace and in lower case."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Vectorization: unigram counts over the tokens produced by preprocess_text
vectorizer = CountVectorizer(tokenizer=preprocess_text, ngram_range=(1, 1))
# Seed the classifier so repeated runs give the same fitted model; the seed
# matches the one passed to train_test_split.
classifier = LinearSVC(random_state=42)

In [None]:
# Using Tfidf: alternative vectorizer that reuses the same preprocess_text tokenizer.
# NOTE(review): not plugged into any pipeline in the cells shown here.
tfvectorizer = TfidfVectorizer(tokenizer = preprocess_text)

In [None]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [None]:
# Features and Labels: each row's text chunk is a sample, labelled with its
# context_episode value.
X = sds_text['episode_split_text']
y = sds_text['context_episode']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): rows are split individually, so if several rows come from the
# same episode they can land on both sides of the split — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
# Create the pipeline to clean, tokenize, vectorize, and classify using CountVectorizer
pipe_countvect = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])
# Fit all pipeline steps on the training split
pipe_countvect.fit(X_train,y_train)


In [None]:
# Predict labels for the held-out test split with the fitted pipeline
sample_prediction = pipe_countvect.predict(X_test)



In [None]:
# Predict labels for the held-out test split with the fitted pipeline
sample_prediction = pipe_countvect.predict(X_test)


# Show each test sample next to the label predicted for it
for (sample, pred) in zip(X_test, sample_prediction):
    print(sample, "Prediction=>", pred)

# Test accuracy, computed from the predictions above so the test set is not
# pushed through the pipeline a second time.
# Note: scoring X_test against sample_prediction would compare the model with
# its own output and always yield 1.0, so it is not reported.
print("Test accuracy: ", accuracy_score(y_test, sample_prediction))
# Train accuracy, printed for comparison with the test accuracy
print("Train accuracy: ", pipe_countvect.score(X_train, y_train))

In [None]:
# NOTE(review): this cell repeats the evaluation performed in the cell above;
# consider deleting one of the two.
# Predict labels for the held-out test split with the fitted pipeline
sample_prediction = pipe_countvect.predict(X_test)


# Show each test sample next to the label predicted for it
for (sample, pred) in zip(X_test, sample_prediction):
    print(sample, "Prediction=>", pred)

# Test accuracy, computed from the predictions above so the test set is not
# pushed through the pipeline a second time.
# Note: scoring X_test against sample_prediction would compare the model with
# its own output and always yield 1.0, so it is not reported.
print("Test accuracy: ", accuracy_score(y_test, sample_prediction))
# Train accuracy, printed for comparison with the test accuracy
print("Train accuracy: ", pipe_countvect.score(X_train, y_train))