# When the last preprocessing step is stemming

In [6]:
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')

data = pd.read_csv('youtube.csv', encoding='utf-8')
data = data.dropna(subset=['CONTENT'])

# Preprocessing
stemmer = PorterStemmer()
for i in range(len(data)):
    comment = data['CONTENT'][i].lower().strip()
    words = word_tokenize(comment)
    stemmed_words = [stemmer.stem(word) for word in words]
    data['CONTENT'][i] = ' '.join(stemmed_words)

# Tokenization
vocabulary = []
for comment in data['CONTENT']:
    words = nltk.tokenize.word_tokenize(comment)
    for word in words:
        vocabulary.append(word)

vocabulary = nltk.FreqDist(vocabulary)

features = [x[0] for x in vocabulary.most_common(700)]

vectors = []
for comment in data['CONTENT']:
    vector = {}
    words = nltk.tokenize.word_tokenize(comment)
    for word in features:
        vector[word] = True if word in words else False
        
    vectors.append(vector)

dataset = list(zip(vectors, data['CLASS']))

train_set, test_set = train_test_split(dataset, test_size=0.25, random_state=1)

nltk_model = SklearnClassifier(KNeighborsClassifier())
nltk_model.train(train_set)
accuracy = accuracy_score([y for (x, y) in test_set], nltk_model.classify_many(x for (x, y) in test_set))
print("Model accuracy: {}".format(accuracy))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ansar9811291\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  youtube['CONTENT'][i] = ' '.join(k)


model Accuracy: 0.8522727272727273


# When the last preprocessing step is lemmetiztion

In [7]:
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

data = pd.read_csv('youtube.csv', encoding='utf-8')
data = data.dropna(subset=['CONTENT'])

# Preprocessing
lemmatizer = WordNetLemmatizer()
for i in range(len(data)):
    comment = data['CONTENT'][i].lower().strip()
    words = word_tokenize(comment)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    data['CONTENT'][i] = ' '.join(lemmatized_words)

# Tokenization
vocabulary = []
for comment in data['CONTENT']:
    words = nltk.tokenize.word_tokenize(comment)
    for word in words:
        vocabulary.append(word)

vocabulary = nltk.FreqDist(vocabulary)

features = [x[0] for x in vocabulary.most_common(700)]

vectors = []
for comment in data['CONTENT']:
    vector = {}
    words = nltk.tokenize.word_tokenize(comment)
    for word in features:
        vector[word] = True if word in words else False
        
    vectors.append(vector)

dataset = list(zip(vectors, data['CLASS']))

train_set, test_set = train_test_split(dataset, test_size=0.25, random_state=1)

nltk_model = SklearnClassifier(KNeighborsClassifier())
nltk_model.train(train_set)
accuracy = accuracy_score([y for (x, y) in test_set], nltk_model.classify_many(x for (x, y) in test_set))
print("Model accuracy: {}".format(accuracy))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ansar9811291\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ansar9811291\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CONTENT'][i] = ' '.join(lemmatized_words)


Model accuracy: 0.8409090909090909


# When the last preprocessing step is error correction


In [None]:
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import words
nltk.download('punkt')
nltk.download('words')

# Get a set of English words for spell checking
english_words = set(words.words())

data = pd.read_csv('youtube.csv', encoding='utf-8')
data = data.dropna(subset=['CONTENT'])

# Preprocessing
for i in range(len(data)):
    comment = data['CONTENT'][i].lower().strip()
    words = word_tokenize(comment)
    
    # Perform spell correction
    corrected_words = []
    for w in words:
        if w in english_words:
            corrected_words.append(w)
        else:
            # If a word is not in the English dictionary, consider it a misspelling and replace it
            # with the most similar English word
            max_similarity = -1
            corrected_word = w
            for ew in english_words:
                similarity = nltk.edit_distance(w, ew)
                if similarity > max_similarity:
                    max_similarity = similarity
                    corrected_word = ew
            corrected_words.append(corrected_word)
    
    data['CONTENT'][i] = ' '.join(corrected_words)

# Tokenization
vocabulary = []
for comment in data['CONTENT']:
    words = nltk.tokenize.word_tokenize(comment)
    for word in words:
        vocabulary.append(word)

vocabulary = nltk.FreqDist(vocabulary)

features = [x[0] for x in vocabulary.most_common(700)]

vectors = []
for comment in data['CONTENT']:
    vector = {}
    words = nltk.tokenize.word_tokenize(comment)
    for word in features:
        vector[word] = True if word in words else False
        
    vectors.append(vector)

dataset = list(zip(vectors, data['CLASS']))

train_set, test_set = train_test_split(dataset, test_size=0.25, random_state=1)

nltk_model = SklearnClassifier(KNeighborsClassifier())
nltk_model.train(train_set)
accuracy = accuracy_score([y for (x, y) in test_set], nltk_model.classify_many(x for (x, y) in test_set))
print("Model accuracy: {}".format(accuracy))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ansar9811291\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Ansar9811291\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CONTENT'][i] = ' '.join(corrected_words)
