<a href="https://colab.research.google.com/github/alexoliveros92/cross_validation/blob/main/Cross_Validation_Text_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC     
from sklearn.naive_bayes import MultinomialNB   

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Reading dataset as dataframe
df = pd.read_csv("News.csv")
pd.set_option('display.max_colwidth', None) # Setting this so we can see the full content of cells
pd.set_option('display.max_columns', None) # to make sure we can see all the columns in output window

# Encode the target column as integers: Sport -> 1, Sci/Tech -> 0.
category_codes = {'Sport':1, 'Sci/Tech':0}
# Series.map() silently turns every label missing from the mapping into NaN, which would only
# surface later as an obscure error during cross-validation. Fail early with a clear message instead.
unmapped_categories = df.loc[~df['Category'].isin(list(category_codes)), 'Category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError("Unexpected values in 'Category' column: {}".format(list(unmapped_categories)))
df['Category'] = df['Category'].map(category_codes)

In [None]:
# Display the dataframe for a quick visual check.
# NOTE(review): the saved output of this cell already contains a 'cleaned_summary' column, which is
# only created by the cleaning cell further down, so this cell appears to have been last executed
# out of order. Confirm with Restart Kernel -> Run All.
df

Unnamed: 0,Category,Summary,cleaned_summary
0,0,"Reuters - A group of technology companies\including Texas Instruments Inc. (TXN.N), STMicroelectronics\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\will propose a new wireless networking standard up to 10 times\the speed of the current generation.",reuters group technology company including texas instrument inc txn n stmicroelectronics stm pa broadcom corp brcm thursday said propose new wireless networking standard time speed current generation
1,0,Reuters - America Online on Thursday said it\plans to sell a low-priced PC targeting low-income and minority\households who agree to sign up for a year of dialup Internet\service.,reuters america online thursday said plan sell low priced pc targeting low income minority household agree sign year dialup internet service
2,0,"Reuters - A group of consumer electronics\makers said on Wednesday they approved the format for a new\generation of discs that can store five times the data of DVDs\at the same cost -- enough to put a full season of ""The\Sopranos"" on one disc.",reuters group consumer electronics maker said wednesday approved format new generation disc store five time data dvd cost enough put full season soprano one disc
3,0,Reuters - The mystery of what went wrong for the\software industry in late June when sales stalled at more than\20 brand-name companies is not even close to being solved\although the third quarter is nearly halfway over.,reuters mystery went wrong software industry late june sale stalled brand name company even close solved although third quarter nearly halfway
4,0,AP - The Norwegian hacker famed for developing DVD encryption-cracking software has apparently struck again #151; this time breaking the locks on Apple Computer Inc.'s wireless music streaming technology.,ap norwegian hacker famed developing dvd encryption cracking software apparently struck time breaking lock apple computer inc wireless music streaming technology
...,...,...,...
52319,1,AP - Police believe they know the identity of the man who threw a chair in a brawl between players and fans during the Indiana-Detroit game on Nov. 19.,ap police believe know identity man threw chair brawl player fan indiana detroit game nov
52320,1,"AP - Catcher Damian Miller's #36;8.75 million, three-year contract with the Milwaukee Brewers was finalized Monday after team doctors allayed concerns over his health.",ap catcher damian miller million three year contract milwaukee brewer finalized monday team doctor allayed concern health
52321,1,AP - The 14-year-old son of NBC Sports chairman Dick Ebersol was presumed dead Monday after a fiery jet crash that killed two crewmen and left the injured executive and another of his sons begging bystanders for help.,ap year old son nbc sport chairman dick ebersol presumed dead monday fiery jet crash killed two crewman left injured executive another son begging bystander help
52322,1,El-Hadji Diouf is expected to receive a three-match ban for spitting in Arjan de Zeeuw #39;s face during Bolton #39;s Premiership match against Portsmouth on Saturday.,el hadji diouf expected receive three match ban spitting arjan de zeeuw face bolton premiership match portsmouth saturday


In [None]:
# Cleaning summaries
_CLEANER_CACHE = {} # lazily-filled holder for the stop-word set and lemmatizer shared by every cleaner() call


def _cleaner_resources():
    """Return the (stop_words, lemmatizer) pair, building both on first use only."""
    if not _CLEANER_CACHE:
        _CLEANER_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANER_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _CLEANER_CACHE['stop_words'], _CLEANER_CACHE['lemmatizer']


def cleaner(summary):
    """Turn one raw news summary into a list of lower-cased, lemmatized tokens.

    Steps, in order:
      1. Parse the text with BeautifulSoup (lxml parser) and keep only its text content.
      2. Replace hashtags, @mentions and URL-like fragments (anything starting with
         '#', '@', 'http://', 'https://' or 'www' up to the next whitespace) with a space.
      3. Replace every run of non-alphabetic characters with a single space.
      4. Tokenize with NLTK, lower-case, drop English stop words and lemmatize with WordNet.

    Parameters
    ----------
    summary : str or bytes
        Raw summary text. Any other value (e.g. the NaN pandas uses for a missing cell)
        yields an empty list instead of raising inside BeautifulSoup.

    Returns
    -------
    list of str
        The lemmas that survive the filtering; may be empty.

    Notes
    -----
    The 'lxml' parser must be installed ('pip install lxml').
    For more info on regular expressions visit https://docs.python.org/3/howto/regex.html
    """
    if not isinstance(summary, (str, bytes)):
        return [] # missing / non-text cell: nothing to clean

    # Built once and reused: constructing these for every row is pure overhead.
    stop_words, lemmatizer = _cleaner_resources()

    text = BeautifulSoup(summary, 'lxml').get_text() # strips markup and decodes HTML entities such as &amp; &quot; &gt;
    text = re.sub(r"(#|@|http://|https://|www)\S*", " ", text) # substituting hashtags, @mentions, urls, etc with whitespace
    text = re.sub("[^A-Za-z]+"," ", text) # substituting any run of non-alphabetic characters with whitespace

    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return [lemmatizer.lemmatize(token) for token in lowered_tokens if token not in stop_words]

# Clean every summary; at this point each 'cleaned_summary' cell holds a list of lemmas.
df['cleaned_summary'] = df.Summary.apply(cleaner)
# Removing rows with cleaned summaries of length 0. The .copy() makes the filtered frame independent
# of the original, so the column assignment below no longer raises pandas' SettingWithCopyWarning.
df = df[df['cleaned_summary'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned summaries....")
print(df[['Summary','cleaned_summary']].head())

df['cleaned_summary'] = df['cleaned_summary'].map(" ".join) # joining tokens to create strings. TfidfVectorizer does not accept tokens as input
data = df['cleaned_summary']
Y = df['Category'] # target column
# min_df=.0005 means that each ngram (unigram, bigram, & trigram) must be present in at least 0.05% of
# the documents to be kept as a feature. With the ~52,000 rows reported by the saved output of this
# cell that is roughly 26 documents (the threshold scales with the number of rows).
tfidf = TfidfVectorizer(min_df=.0005, ngram_range=(1,3))
data_tfidf = tfidf.fit_transform(data) # learn vocabulary of entire data and create tfidf values in one pass
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out() and fall back
# to the old method only on versions that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
print("The created tokens: \n", feature_names)
print("Shape of tfidf matrix: ", data_tfidf.shape)


Printing top 5 rows of dataframe showing original and cleaned summaries....
                                                                                                                                                                                                                                                                  Summary  \
0  Reuters - A group of technology companies\including Texas Instruments Inc. (TXN.N), STMicroelectronics\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\will propose a new wireless networking standard up to 10 times\the speed of the current generation.   
1                                                                                     Reuters - America Online on Thursday said it\plans to sell a low-priced PC targeting low-income and minority\households who agree to sign up for a year of dialup Internet\service.   
2                     Reuters - A group of consumer electronics\makers said on Wednesday they approved the format for

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_summary'] = [" ".join(row) for row in df['cleaned_summary'].values] # joining tokens to create strings. TfidfVectorizer does not accept tokens as input


The created tokens: 
Shape of tfidf matrix:  (52317, 7031)




In [None]:
# Implementing Support Vector Classifier
# LinearSVC is a linear SVM (default C = 1). random_state pins the solver's internal data shuffling
# (used by the dual solver) so the fold accuracies below are reproducible from run to run.
svc_clf = LinearSVC(random_state=1)

# Running cross-validation
# NOTE(review): data_tfidf was produced by a TfidfVectorizer fitted on ALL rows before splitting, so
# the vocabulary and IDF weights include the test folds. The accuracies are therefore likely slightly
# optimistic; fitting the vectorizer inside each fold would remove this leakage.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores = []
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC; fit() discards whatever was learned on the previous fold
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)


Iteration  1
Cross-validation accuracy:  0.9864296636085627
Iteration  2
Cross-validation accuracy:  0.9843272171253823
Iteration  3
Cross-validation accuracy:  0.9818425076452599
Iteration  4
Cross-validation accuracy:  0.9875764525993884
Iteration  5
Cross-validation accuracy:  0.9841360856269113
Iteration  6
Cross-validation accuracy:  0.9858562691131498
Iteration  7
Cross-validation accuracy:  0.9850917431192661
Iteration  8
Cross-validation accuracy:  0.9870005735041101
Iteration  9
Cross-validation accuracy:  0.9856623972471803
Iteration  10
Cross-validation accuracy:  0.985471229210476
Mean cross-validation accuracy:  0.9853394138799688


In [None]:
# Implementing Naive Bayes Classifier
# MultinomialNB is created with its default settings here; it is fitted on each training fold
# in the cross-validation cell that follows.
nbc_clf = MultinomialNB()

In [None]:
# Running cross-validation for the Naive Bayes classifier
# Stratified 10-fold split, shuffled with a fixed seed so the folds are the same on every run.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
scores = [] # one accuracy value per fold
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)
    # Slice the tf-idf matrix and the target column into this fold's train / test parts
    X_train = data_tfidf[train_index]
    Y_train = Y.iloc[train_index]
    X_test = data_tfidf[test_index]
    Y_test = Y.iloc[test_index]
    # fit() returns the estimator itself, so fitting and predicting can be chained
    Y_pred = nbc_clf.fit(X_train, Y_train).predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # share of correctly classified test rows
    print("Cross-validation accuracy: ", score)
    scores.append(score)
nbc_mean_accuracy = np.mean(scores) # average accuracy over the 10 folds
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Iteration  1
Cross-validation accuracy:  0.9822247706422018
Iteration  2
Cross-validation accuracy:  0.9820336391437309
Iteration  3
Cross-validation accuracy:  0.9789755351681957
Iteration  4
Cross-validation accuracy:  0.9845183486238532
Iteration  5
Cross-validation accuracy:  0.9827981651376146
Iteration  6
Cross-validation accuracy:  0.9831804281345565
Iteration  7
Cross-validation accuracy:  0.9837538226299695
Iteration  8
Cross-validation accuracy:  0.9852800611737718
Iteration  9
Cross-validation accuracy:  0.9806920282928694
Iteration  10
Cross-validation accuracy:  0.9831772127700249
Mean cross-validation accuracy:  0.9826634011716789
