In [53]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report 
from sklearn.svm import NuSVC 


<h4 style="color:green">Preprocessing</h4>

In [54]:
# Ordered (pattern, replacement) rewrite rules used by clean_str.
# The order matters: contraction suffixes are stripped before bare apostrophes,
# and whitespace is collapsed only after every other substitution has run.
_CLEAN_RULES = (
    (r"\'s", ""),
    (r"\'ve", ""),
    (r"n\'t", ""),
    (r"\'re", ""),
    (r"\'d", ""),
    (r"\'ll", ""),
    (r",", ""),
    (r"!", " ! "),
    (r"\(", ""),
    (r"\)", ""),
    (r"\?", ""),
    (r"'", ""),
    (r"[^A-Za-z0-9(),!?\'\`]", " "),
    (r"[0-9]\w+|[0-9]", ""),
    (r"\s{2,}", " "),
)


def clean_str(string):
    """Normalise a piece of raw text into lowercase, mostly alphabetic words.

    The rules in ``_CLEAN_RULES`` are applied in order. They remove the
    contraction suffixes ('s, 've, n't, 're, 'd, 'll), commas, parentheses,
    question marks and apostrophes; surround ``!`` with spaces; replace every
    character outside ``[A-Za-z0-9(),!?'`]`` with a space; delete each digit
    together with any word characters that directly follow it; and collapse
    runs of two or more whitespace characters into one space.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text, stripped of surrounding whitespace and lowercased.
    """
    cleaned = string
    for pattern, replacement in _CLEAN_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

In [55]:
# Apply clean_str to every article in a collection.
def preprocess(x):
    """Clean every document in ``x`` and return the results as a new list.

    Each document is passed through ``clean_str`` and its whitespace is
    normalised to single spaces. The input is not modified, so the raw texts
    stay available to the caller and re-running a cell is idempotent. Any
    iterable of strings is accepted (list, tuple, generator, pandas Series).

    Parameters
    ----------
    x : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        Cleaned documents, in the same order as the input.
    """
    # split() + join() normalises whitespace; no per-word filtering is needed.
    return [' '.join(clean_str(value).split()) for value in x]

<h4 style="color:green">Features extraction</h4>

In [56]:
def features_extract(x, min_df=4, stop_words='english'):
    """Fit a TF-IDF vectorizer on ``x`` and return it with the feature matrix.

    Parameters
    ----------
    x : iterable of str
        Preprocessed documents.
    min_df : int or float, default=4
        Forwarded to ``TfidfVectorizer``. The default matches the value this
        function used to hard-code; lower it for small corpora.
    stop_words : default='english'
        Forwarded to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, features)`` where ``vectorizer`` is the fitted
        ``TfidfVectorizer`` and ``features`` is the result of
        ``fit_transform(x)``.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=min_df)
    features = vectorizer.fit_transform(x)
    return vectorizer, features


<h4 style="color:green">Building and evaluating the model</h4>

In [31]:

def fitting(X, y, test_size=0.2, random_state=0):
    """Split the data, train a NuSVC classifier and return everything.

    Parameters
    ----------
    X : feature matrix
        Features as produced by ``features_extract``.
    y : sequence
        Class label for each row of ``X``.
    test_size : float, default=0.2
        Fraction of the samples held out for testing (the previously
        hard-coded value).
    random_state : int, default=0
        Seed for the train/test split (the previously hard-coded value). It
        is also passed to ``NuSVC`` so that the probability calibration
        enabled by ``probability=True`` is reproducible between runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, model, clf)``. ``clf`` is the
        value returned by ``model.fit``; scikit-learn estimators return
        themselves from ``fit``, so it refers to the same object as ``model``.
    """
    # Create training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the model
    model = NuSVC(kernel='rbf', probability=True, random_state=random_state)
    clf = model.fit(X_train, y_train)

    return X_train, X_test, y_train, y_test, model, clf


In [57]:
def evaluate(X_train,y_train,X_test,y_test,model,target_names=None):
    """Build a text classification report for ``model`` on the test split.

    ``X_train`` and ``y_train`` are accepted for interface compatibility but
    are not used.

    Parameters
    ----------
    X_test : feature matrix
        Held-out features to predict on.
    y_test : sequence
        True labels for ``X_test``.
    model : fitted estimator
        Object exposing ``predict``.
    target_names : list of str, optional
        Display names forwarded to ``classification_report``.

    Returns
    -------
    The value returned by ``classification_report`` for the test predictions.
    """
    predicted = model.predict(X_test)
    report = classification_report(y_test, predicted, target_names=target_names)
    return report
    

<h4 style="color:red">BBC dataset</h4>

In [58]:
# Load the BBC articles: the 'news' column holds the text, 'type' the label.
data = pd.read_csv('../dataset/dataset.csv', encoding="cp1252")
y = data['type'].tolist()

# Clean the raw texts, then turn them into TF-IDF features.
x = preprocess(data['news'].tolist())
BBC_vectorizer, X = features_extract(x)
Y = np.array(y)

# Train on the training split and score on the held-out split.
(X_train, X_test, y_train, y_test, BBC_model, clf) = fitting(X, y)
BBC_results = evaluate(X_train, y_train, X_test, y_test, BBC_model)

<h4 style="color:red">20 Newsgroups dataset</h4>

In [59]:
from sklearn.datasets import fetch_20newsgroups

# Only five of the newsgroups are used as classes.
categories = [ 'misc.forsale','comp.graphics','soc.religion.christian','talk.politics.guns','rec.sport.baseball']
newsgroups_data = fetch_20newsgroups(categories=categories, subset="all")
# Kept as a global: needed later to map a numeric prediction back to its name.
target_names = newsgroups_data.target_names

y = newsgroups_data['target']

# Clean the raw posts, then turn them into TF-IDF features.
x = preprocess(newsgroups_data['data'])
newsgroups_vectorizer, X = features_extract(x)
Y = np.array(y)

# Train on the training split and score on the held-out split.
(X_train, X_test, y_train, y_test, newsgroups_model, clf) = fitting(X, y)
newsgroups_results = evaluate(
    X_train, y_train, X_test, y_test, newsgroups_model, target_names
)

<h4 style="color:green">Saving the models and the vectorizers</h4>

In [38]:
import pickle

# Persist the fitted models and vectorizers.
# Each file is opened in a `with` block so the handle is flushed and closed.
# Mode 'wb' (rather than exclusive-create 'xb') lets the cell be re-run after
# retraining: 'xb' raises FileExistsError once the files exist, which leaves
# stale artifacts on disk.
for artifact, filename in (
    (BBC_model, "BBC_model.sav"),
    (newsgroups_model, "newsgroups_model.sav"),
    (BBC_vectorizer, "BBC_vectorizer.sav"),
    (newsgroups_vectorizer, "newsgroups_vectorizer.sav"),
):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

<h4 style="color:green">Results</h4>

In [60]:
# Show both classification reports, each under its own banner line.
for banner, report in (
    ('-------------------BBC Model------------------------', BBC_results),
    ('-------------------20Newsgroup Model------------------------', newsgroups_results),
):
    print(banner)
    print(report)

-------------------BBC Model------------------------
               precision    recall  f1-score   support

     business       0.96      0.97      0.96       111
entertainment       0.97      0.97      0.97        74
     politics       0.96      0.95      0.96        86
        sport       1.00      1.00      1.00       108
         tech       0.97      0.95      0.96        66

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445

-------------------20Newsgroup Model------------------------
                        precision    recall  f1-score   support

         comp.graphics       0.94      0.99      0.96       195
          misc.forsale       0.95      0.98      0.97       187
    rec.sport.baseball       0.99      0.97      0.98       209
soc.religion.christian       1.00      0.94      0.97       203
    talk.politics.guns       1.00      0.98      0.99       176

  

<h4 style="color:green">Testing a new article</h4>

In [42]:
def check_news_type(news_article,vect,model,target_names = None):
    """Predict the category of a news article.

    Parameters
    ----------
    news_article : str or iterable of str
        The article text. A single string is treated as one document. If an
        iterable of documents is given, only the prediction for the first one
        is returned.
    vect : fitted vectorizer
        Object exposing ``transform``, e.g. the vectorizer returned by
        ``features_extract``.
    model : fitted classifier
        Object exposing ``predict``.
    target_names : sequence of str, optional
        If given, the prediction is used as an index into this sequence to
        obtain the category name; otherwise the raw prediction is used.

    Returns
    -------
    str
        The predicted category.
    """
    # list("some text") would split the string into single characters, so the
    # model would classify the first character instead of the article.
    # Wrap a lone string as a one-document batch instead.
    if isinstance(news_article, str):
        documents = [news_article]
    else:
        documents = list(news_article)

    documents = preprocess(documents)
    features = vect.transform(documents)
    prediction = model.predict(features)[0]
    category = prediction if target_names is None else target_names[prediction]
    return str(category)

In [49]:
def _load_artifact(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # artifacts produced by this notebook or another trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


bbc_model = _load_artifact("BBC_model.sav")
bbc_vect = _load_artifact("BBC_vectorizer.sav")
group20new_model = _load_artifact("newsgroups_model.sav")
group20new_vect = _load_artifact("newsgroups_vectorizer.sav")


In [48]:
# Sample article used to smoke-test the saved classifiers.
# NOTE(review): "(?600m)" looks like a mis-encoded currency symbol in the
# source text; it is left untouched here because it is the model input.
article = """Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (?600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding.

Time Warner's fourth quarter profits were slightly better than analysts' expectations. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.

TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake."""

In [50]:
# Classify the sample article with the reloaded BBC model and vectorizer.
check_news_type(news_article=article, vect=bbc_vect, model=bbc_model)

'business'

In [52]:
# Classify the same article with the 20 Newsgroups model; target_names maps
# the numeric class index back to the newsgroup name.
check_news_type(
    news_article=article,
    vect=group20new_vect,
    model=group20new_model,
    target_names=target_names,
)

'comp.graphics'