In [61]:
import html
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from preprocess import PreProcess

In [3]:
# Load the article dataset into a DataFrame; its Document column is what the
# pipeline cells further down process.
df = pd.read_csv('bbc_dataset.csv')

In [4]:
# Peek at the first rows to check the columns and the raw text format.
df.head()

Unnamed: 0,Document,Category
0,"b""Pernod takeover talk lifts Domecq\n\nShares ...",business
1,b'Qwest may spark MCI bidding war\n\nUS phone ...,business
2,b'Dogged Federer claims Dubai crown\n\nWorld n...,sport
3,b'Oscars race enters final furlong\n\nThe race...,entertainment
4,"b""Hearts of Oak 3-2 Cotonsport\n\nHearts of Oa...",sport


In [8]:
def read_process_data(path, test_size=0.20, random_state=None):
    """Load a labelled text CSV, clean its text column and TF-IDF-encode the training split.

    Parameters
    ----------
    path : str or path-like
        CSV file handed to ``pd.read_csv``. The frame must expose ``Document``
        and ``Category`` columns; its first column is passed to ``PreProcess``
        as the column to clean.
    test_size : float, default 0.20
        Share of rows held out by ``train_test_split``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded behaviour; pass an int to get the same split on every run.

    Returns
    -------
    tuple
        ``(train_vectors, train_y)``: the TF-IDF matrix fitted on the training
        documents and the matching ``Category`` labels.

    Notes
    -----
    The held-out split and the fitted vectorizer are local to this function
    and are not returned, so callers cannot vectorize test documents with the
    same vocabulary.
    """
    data = pd.read_csv(path)
    column_name = data.columns[0]
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    # The cleaning steps run strictly in this order; each returns the updated frame.
    cleaning_steps = (
        pre_processor.clean_html,
        pre_processor.remove_non_ascii,
        pre_processor.remove_spaces,
        pre_processor.remove_punctuation,
        pre_processor.stemming,
        pre_processor.lemmatization,
        pre_processor.stop_words,
    )
    for step in cleaning_steps:
        data = step()
    # Only the training half is used below; the test half is deliberately dropped.
    train_x, _, train_y, _ = train_test_split(
        data.Document,
        data.Category,
        test_size=test_size,
        random_state=random_state,
    )
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    return train_vectors, train_y

In [10]:
# Cleaned, TF-IDF-encoded training split: x holds the feature vectors, y the
# matching Category labels (the held-out split is discarded inside the function).
x,y = read_process_data('bbc_dataset.csv')

In [18]:
# x.todense(),y

In [89]:
def remove_non_ascii(df):
    """Drop every character whose code point is 128 or above from each element.

    ``df`` is any object with an element-wise ``.apply`` (in this notebook, a
    pandas Series of strings). The result of that ``.apply`` call is returned;
    the input is not modified.
    """
    def _ascii_only(text):
        # Keep only the characters that fit in 7-bit ASCII.
        return "".join(filter(lambda ch: ord(ch) < 128, text))

    return df.apply(_ascii_only)
class PreProcess(BaseEstimator, TransformerMixin):
    """Text clean-up helpers, usable as a scikit-learn pipeline step.

    Two usage styles are supported:

    * Pipeline step -- ``PreProcess()``: ``transform(series)`` unescapes HTML
      entities and then drops non-ASCII characters.
    * Column-bound helper -- ``PreProcess(data, column_name)``:
      ``remove_punctuation``, ``stemming``, ``lemmatization`` and
      ``stop_words`` rewrite ``data[column_name]`` in place and return
      ``data``.

    ``clean_html`` and ``remove_spaces`` are static helpers that take the
    Series to clean and return a new one.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose text column the column-bound methods rewrite.
    column_name : str, optional
        Name of that text column.
    """
    # todo: Pass functions as a list of arguments to apply in the class
    # todo: make set of words before applying all operations to reduce processing time.
    def __init__(self, data=None, column_name=None):
        # Constructor arguments are stored under their own names, which is
        # what BaseEstimator.get_params()/clone() expect.
        self.data = data
        self.column_name = column_name
        self.stemmer = PorterStemmer()
        self.lemmatiser = WordNetLemmatizer()

    @staticmethod
    def clean_html(df):
        """Replace HTML character references (e.g. ``&amp;``) in each element with the characters they denote."""
        return df.apply(html.unescape)

    @staticmethod
    def remove_spaces(df):
        """Turn newlines and tabs into spaces, collapse runs of spaces, and lower-case each element."""
        def _normalise(text):
            text = text.replace('\n', ' ').replace('\t', ' ')
            # A single replace('  ', ' ') pass leaves runs of three or more
            # spaces only partly collapsed; the regex reduces any run to one.
            return re.sub(' {2,}', ' ', text).lower()

        return df.apply(_normalise)

    def _map_column(self, func):
        """Apply ``func`` to every value of ``data[column_name]`` in place and return ``data``."""
        if self.data is None or self.column_name is None:
            raise ValueError(
                "PreProcess was created without data/column_name; "
                "use PreProcess(data, column_name) for the column-bound methods."
            )
        self.data[self.column_name] = self.data[self.column_name].apply(func)
        return self.data

    def remove_punctuation(self):
        """Delete every ``string.punctuation`` character from the bound column."""
        tr = str.maketrans("", "", string.punctuation)
        return self._map_column(lambda text: text.translate(tr))

    def stemming(self):
        """Stem each whitespace-separated token of the bound column with the Porter stemmer."""
        # todo: provide option of selecting stemmer.
        stem = self.stemmer.stem
        return self._map_column(lambda text: " ".join(stem(token) for token in text.split()))

    def lemmatization(self):
        """Lemmatize each whitespace-separated token of the bound column with WordNet."""
        lemmatize = self.lemmatiser.lemmatize
        return self._map_column(lambda text: " ".join(lemmatize(token) for token in text.split()))

    def stop_words(self):
        """Drop English stop words from the bound column, keeping one copy of each remaining token.

        NOTE(review): de-duplicating tokens discards term counts; this mirrors
        the previous ``set(...)`` behaviour -- confirm it is intended before
        feeding the result to a frequency-based vectorizer.
        """
        # frozenset gives O(1) membership tests instead of scanning a list per token.
        stop = frozenset(stopwords.words('english'))

        def _filter(text):
            kept = (token for token in text.split() if token not in stop)
            # dict.fromkeys de-duplicates like set() did, but preserves
            # first-seen order so the output is deterministic across runs.
            return " ".join(dict.fromkeys(kept))

        return self._map_column(_filter)

    def transform(self, df):
        """Return ``df`` with HTML entities unescaped and non-ASCII characters removed.

        ``df`` is a Series of strings (the notebook passes ``df.Document``).
        """
        # Unescape first: html.unescape can itself produce non-ASCII characters
        # (e.g. '&eacute;' -> 'é'), so the ASCII filter has to run afterwards.
        # This is also the order read_process_data applies.
        df = self.clean_html(df)
        df = remove_non_ascii(df)
        return df

    def fit(self, df, y=None, **fit_params):
        """No-op fit: there is nothing to learn, so the instance is returned unchanged."""
        return self

In [90]:
# Check which column read_process_data will pick as the text column (it uses columns[0]).
df.columns[0]

'Document'

In [91]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessing(BaseEstimator, TransformerMixin):
    """Stateless text-normalising transformer for use as a scikit-learn pipeline step.

    ``transform`` unescapes HTML entities, drops non-ASCII characters and
    lower-cases each element; ``fit`` learns nothing.
    """

    def __init__(self):
        pass

    def transform(self, df):
        """Return a cleaned copy of ``df``.

        Parameters
        ----------
        df : object with an element-wise ``.apply`` whose elements are strings
            (e.g. a pandas Series of documents).

        Returns
        -------
        The result of ``df.apply``: each element unescaped, reduced to ASCII
        and lower-cased.
        """
        def _normalise(text):
            # Unescape before filtering: html.unescape can produce non-ASCII
            # characters (e.g. '&eacute;' -> 'é'), which would otherwise
            # survive an earlier ASCII filter.
            text = html.unescape(text)
            text = "".join(ch for ch in text if ord(ch) < 128)
            return text.lower()

        # One pass over the data instead of three separate apply() calls.
        return df.apply(_normalise)

    def fit(self, df, y=None, **fit_params):
        """No-op fit: there is nothing to learn, so the instance is returned unchanged."""
        return self

In [92]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

In [93]:
# Single-step pipeline wrapping the PreProcess transformer; 'Test1' is only
# the name given to that step.
pipe = Pipeline([
    ('Test1', PreProcess())
])

In [94]:
# Run the pipeline over the Document column. PreProcess.fit is a no-op, so
# this effectively just applies PreProcess.transform.
xx =pipe.fit_transform(df.Document)

In [95]:
# Display the transformed documents.
xx

0       b"Pernod takeover talk lifts Domecq\n\nShares ...
1       b'Qwest may spark MCI bidding war\n\nUS phone ...
2       b'Dogged Federer claims Dubai crown\n\nWorld n...
3       b'Oscars race enters final furlong\n\nThe race...
4       b"Hearts of Oak 3-2 Cotonsport\n\nHearts of Oa...
5       b'U2 stars enter rock Hall of Fame\n\nSinger B...
6       b'News Corp eyes video games market\n\nNews Co...
7       b'Lewsey puzzle over disallowed try\n\nEngland...
8       b'Giggs handed Wales leading role\n\nRyan Gigg...
9       b'India power shares jump on debut\n\nShares i...
10      b'Ericsson sees earnings improve\n\nTelecoms e...
11      b'Robinson wants dual code success\n\nEngland ...
12      b'Fit-again Betsen in France squad\n\nFrance h...
13      b'Serena ends Sania Mirza\'s dream\n\nSania Mi...
14      b'Fockers fuel festive film chart\n\nComedy Me...
15      b'Pop band Busted to \'take a break\'\n\nChart...
16      b'Hingis to make unexpected return\n\nMartina ...
17      b'Chin

In [44]:
# Display the raw Document column.
# NOTE(review): values appear to be stringified bytes literals (leading b'...') -- confirm how the CSV was written.
df.Document

0       b"Pernod takeover talk lifts Domecq\n\nShares ...
1       b'Qwest may spark MCI bidding war\n\nUS phone ...
2       b'Dogged Federer claims Dubai crown\n\nWorld n...
3       b'Oscars race enters final furlong\n\nThe race...
4       b"Hearts of Oak 3-2 Cotonsport\n\nHearts of Oa...
5       b'U2 stars enter rock Hall of Fame\n\nSinger B...
6       b'News Corp eyes video games market\n\nNews Co...
7       b'Lewsey puzzle over disallowed try\n\nEngland...
8       b'Giggs handed Wales leading role\n\nRyan Gigg...
9       b'India power shares jump on debut\n\nShares i...
10      b'Ericsson sees earnings improve\n\nTelecoms e...
11      b'Robinson wants dual code success\n\nEngland ...
12      b'Fit-again Betsen in France squad\n\nFrance h...
13      b'Serena ends Sania Mirza\'s dream\n\nSania Mi...
14      b'Fockers fuel festive film chart\n\nComedy Me...
15      b'Pop band Busted to \'take a break\'\n\nChart...
16      b'Hingis to make unexpected return\n\nMartina ...
17      b'Chin