In [None]:
!pip install -U pip setuptools wheel
!pip install -U textstat
!pip install textblob
!pip install --upgrade scikit-learn

In [1]:
import re
import numpy as np 
import pandas as pd
import textstat
import string
import nltk
from textblob import TextBlob

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from nltk.corpus import stopwords
from sklearn.semi_supervised import SelfTrainingClassifier, LabelSpreading
from sklearn.linear_model import SGDClassifier

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Fetch Data

In [2]:
news_group = fetch_20newsgroups(subset='train')
news_group_data = news_group.data
news_group_target_names = news_group.target_names
news_group_target = news_group.target

news_group_test = fetch_20newsgroups(subset='test')
news_group_test_data = news_group_test.data
news_group_test_target_names = news_group_test.target_names
news_group_test_target = news_group_test.target

# Convert to Pandas DF and Random Sampling

In [3]:
news_df = pd.DataFrame({'news': news_group_data, 
                        'class': news_group_target})

news_sampled = news_df.sample(2000)
news_sampled.reset_index(drop=True, inplace=True)

news_df_test = pd.DataFrame({'news': news_group_test_data, 
                        'class': news_group_test_target})

news_sampled_test = news_df_test.sample(400)
news_sampled_test.reset_index(drop=True, inplace=True)

# Cleaning Text

*   Cleaning
*   Removing stop words



In [4]:
class Cleaner():

    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
        self.re_email = re.compile('(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')

    def clean_news(self, text):
        text = re.sub(r'(From:\s+[^\n]+\n)', '', text) # remove From
        text = re.sub(r'(Subject:)', '', text) # remove the word "Subject:""
        text = text.lower() # Convert to lowerCase
        text = text.strip() # Strip terminal spaces
        text = re.sub(self.re_url, '', text)
        text = re.sub(self.re_email, '', text)       
        text = re.sub(r'\s+\w{1}\s+', ' ', text) #remove single char
        #text = text.replace('\n',' ')
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text) # punctuations
        text = re.sub(r'^\d+\s|\s\d+\s|\s\d+$', ' ', text) # remove pure digits
        text = re.sub(r'(\s+)', ' ', text) # replace >1 whitespaces with single space

        return text

    def removeStopWords(self, text):
        
        x = text.split(' ')
        for word in x:
            if(word in self.stop_words):
                x = list(filter((word).__ne__, x))
        return ' '.join(x)

    def fit(self, x, y=None):
        return self
    def transform(self, data):
        cleaner = Cleaner()
        
        data_array = []
        for d in data:
            s = cleaner.clean_news(d)
            w = cleaner.removeStopWords(s)
            data_array.append(w)
        return data_array 

# BOW Vectorizer

In [5]:
class BOWVectorizer():
    def __init__(self):
        self.vectorize = None
    def fit(self, x, y=None):
        bowvec = TfidfVectorizer()
        bowvec.fit(x)
        self.vectorize = bowvec
        return self.vectorize
    
    def transform(self, data):
        x = self.vectorize.transform(data)
        return x

# POS Tagging

In [6]:
class POSVectorizer():
    def __init__(self):
        pass

    def creatingPOSTags(self, x):
             
        pos_family = {'NOUN' : ['NN','NNS','NNP','NNPS'],
                    'PRON' : ['PRP','PRP$','WP','WP$'], 
                    'VERB' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
                    'ADJ'  : ['JJ','JJR','JJS'],
                    'ADV'  : ['RB','RBR','RBS','WRB']
                    }
            
        count_pos = {'NOUN':0,'PRON':0,'VERB':0,'ADJ':0,'ADV':0}
        
        blob  = TextBlob(x) #converts sentences to tokens
        for tuple in blob.tags: #blob tags contains term and its pos
            #print(tuple)
            pos = list(tuple)[1]
            if pos in pos_family['NOUN']:
                count_pos['NOUN'] = count_pos.get('NOUN')+1
            elif pos in pos_family['PRON']:
                count_pos['PRON'] = count_pos.get('PRON')+1
            elif pos in pos_family['VERB']:
                count_pos['VERB'] = count_pos.get('VERB')+1
            elif pos in pos_family['ADJ']:
                count_pos['ADJ'] = count_pos.get('ADJ')+1
            elif pos in pos_family['ADV']:
                count_pos['ADV'] = count_pos.get('ADV')+1
        return count_pos 
    
    def fit(self, x, y=None):
        return self
    def transform(self, data):
        posVector = POSVectorizer()
        pos_vect = []
        for d in data:
            pos_vect.append(posVector.creatingPOSTags(d))
        return pos_vect

# Convert toArray()

In [7]:
class ToArray():

    def transform(self, X):
        return X.toarray()

    def fit(self, X, y=None, **fit_params):        
        return self

# Creating Pipelines

In [8]:
bow_transformer = Pipeline(
    steps=[
        ("cleaner", Cleaner()),
        ("bow", BOWVectorizer()),
        ("toarray", ToArray()), #converting toarray since minmax can't handle sparce matrix
        ("scale", preprocessing.MinMaxScaler())
    ]
)


pos_transformer = Pipeline(
    steps=[
        ("cleaner", Cleaner()),
        ("pos", POSVectorizer()),
        ("dict_vect", DictVectorizer()),
        ("toarray", ToArray()), #converting toarray since minmax can't handle sparce matrix
        ("scale", preprocessing.MinMaxScaler())
    ]
)

combined_features = FeatureUnion(
    transformer_list=[
        ("bow", bow_transformer),
        ("pos", pos_transformer),
    ]
)

def fitFinalPipeline(classifier, X_train, Y_train, X_test, Y_test):
    final_pipeline = Pipeline(
        steps=[
            ("combined_features", combined_features),
            ('chi',  SelectKBest(chi2, k=20000)),
            ("classifier", classifier),
        ]
    )

    final_pipeline.fit(X_train, Y_train)
    y_pred = final_pipeline.predict(X_test)
    cr = classification_report(Y_test, y_pred)
    print(cr)

# RandomForest

In [9]:
fitFinalPipeline (RandomForestClassifier(), news_sampled['news'], news_sampled['class'], news_sampled_test['news'], news_sampled_test['class'])

              precision    recall  f1-score   support

           0       0.67      0.78      0.72        18
           1       0.76      0.76      0.76        17
           2       0.49      0.86      0.62        22
           3       0.61      0.58      0.59        19
           4       0.52      0.62      0.57        21
           5       0.89      0.64      0.74        25
           6       0.81      0.68      0.74        19
           7       0.75      0.86      0.80        21
           8       0.81      0.81      0.81        21
           9       0.59      0.72      0.65        18
          10       0.88      1.00      0.94        22
          11       0.75      0.90      0.82        10
          12       0.88      0.30      0.45        23
          13       0.50      0.65      0.57        20
          14       0.96      0.80      0.87        30
          15       0.70      0.91      0.79        23
          16       0.70      0.89      0.78        18
          17       1.00    

# SGD

In [10]:
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')

fitFinalPipeline (SGDClassifier(**sdg_params), news_sampled['news'], news_sampled['class'], news_sampled_test['news'], news_sampled_test['class'])

              precision    recall  f1-score   support

           0       0.80      0.44      0.57        18
           1       0.65      0.65      0.65        17
           2       0.46      0.82      0.59        22
           3       0.56      0.26      0.36        19
           4       0.47      0.67      0.55        21
           5       0.94      0.60      0.73        25
           6       0.71      0.63      0.67        19
           7       0.83      0.71      0.77        21
           8       0.76      0.76      0.76        21
           9       0.63      0.67      0.65        18
          10       0.84      0.73      0.78        22
          11       0.86      0.60      0.71        10
          12       0.61      0.48      0.54        23
          13       0.75      0.45      0.56        20
          14       0.90      0.90      0.90        30
          15       0.72      0.57      0.63        23
          16       0.37      0.83      0.51        18
          17       0.54    

# Mask

In [11]:
y_mask = np.random.rand(len(news_sampled['class'])) < 0.2
y_masked_class = news_sampled
y_masked_class['class'][~y_mask] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# LabelSpreading

In [12]:
fitFinalPipeline (LabelSpreading(gamma=0.25, max_iter=5), news_sampled['news'], y_masked_class['class'], news_sampled_test['news'], news_sampled_test['class'])



              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.00      0.00      0.00        17
           2       0.06      1.00      0.11        22
           3       1.00      0.05      0.10        19
           4       0.00      0.00      0.00        21
           5       1.00      0.04      0.08        25
           6       0.00      0.00      0.00        19
           7       0.00      0.00      0.00        21
           8       0.00      0.00      0.00        21
           9       0.00      0.00      0.00        18
          10       0.00      0.00      0.00        22
          11       0.00      0.00      0.00        10
          12       0.00      0.00      0.00        23
          13       0.00      0.00      0.00        20
          14       0.00      0.00      0.00        30
          15       0.00      0.00      0.00        23
          16       0.00      0.00      0.00        18
          17       0.00    

  probabilities /= normalizer
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SelfTrainingClassifier

In [13]:
sdg_params = dict(alpha=1e-5, penalty='l2', loss='log')
fitFinalPipeline (SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True), news_sampled['news'], y_masked_class['class'], news_sampled_test['news'], news_sampled_test['class'])

End of iteration 1, added 1458 new labels.
End of iteration 2, added 127 new labels.
End of iteration 3, added 16 new labels.
End of iteration 4, added 1 new labels.
End of iteration 5, added 1 new labels.
              precision    recall  f1-score   support

           0       0.50      0.50      0.50        18
           1       0.83      0.29      0.43        17
           2       0.52      0.64      0.57        22
           3       1.00      0.11      0.19        19
           4       0.36      0.19      0.25        21
           5       0.43      0.64      0.52        25
           6       0.60      0.63      0.62        19
           7       0.50      0.52      0.51        21
           8       0.68      0.62      0.65        21
           9       0.62      0.56      0.59        18
          10       0.54      0.68      0.60        22
          11       0.36      0.50      0.42        10
          12       0.75      0.13      0.22        23
          13       0.13      0.65    