## Author
- Selim Lakhdar
    - selim.lakhdar@gmail.com
    - selim.lakhdar.etu@univ-lille.fr
------------------------------

## Import Libraries

In [1]:
from bs4 import BeautifulSoup,SoupStrainer

import os
import time
import re
import pandas as pd
import numpy as np

import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [2]:
random_state = 42

## Reuters: Topic Prediction

### Load Data

In [3]:
def data_loader():
    l_topics, l_title, l_body = [], [], []

    for i in range(22):
        # 17 file contains error
        if i == 17:
            continue

        file_name = f'../data/reuteurs21578/reut2-{i:03d}.sgm'
        print(file_name)
        f = open(file_name, 'r')
        data = f.read()

        soup = BeautifulSoup(data, "html.parser")
        contents = soup.findAll('reuters')

        topic_err, title_err, body_err = 0, 0, 0

        for content in contents:
            # topics
            try:
                topics = content.findAll("topics")[0].findAll("d")
                tmp_topics = []
                for topic in topics:
                    tmp_topics += [topic.text]

                if len(tmp_topics) == 0:
                    topic_err += 1
                    l_topics += [['Unkown']]
                else:
                    l_topics += [tmp_topics]
            except:
                topic_err += 1
                l_topics += ['Unkown']
            # title
            try:
                title = content.findAll("title")[0].text
                l_title += [title]
            except:
                title_err += 1
                l_title += ['Unkown']
            # body
            try:        
                body = content.findAll("body")[0].text
                l_body += [body]
            except:
                body_err += 1
                l_body += ['Unkown']

        print(i, "l_topics:", len(l_topics), "topic_err:", topic_err)
        print(i, "l_title:", len(l_title), "title_err:", title_err)
        print(i, "l_body:", len(l_body), "body_err:", body_err)
        print("===================================")

    reuteurs_df = pd.DataFrame()
    reuteurs_df['topics'] = l_topics
    reuteurs_df['title'] = l_title
    reuteurs_df['body'] = l_body
    return reuteurs_df

In [4]:
reuteurs_df = data_loader()
reuteurs_df

../data/reuteurs21578/reut2-000.sgm
0 l_topics: 1000 topic_err: 488
0 l_title: 1000 title_err: 15
0 l_body: 1000 body_err: 75
../data/reuteurs21578/reut2-001.sgm
1 l_topics: 2000 topic_err: 476
1 l_title: 2000 title_err: 5
1 l_body: 2000 body_err: 70
../data/reuteurs21578/reut2-002.sgm
2 l_topics: 3000 topic_err: 436
2 l_title: 3000 title_err: 3
2 l_body: 3000 body_err: 94
../data/reuteurs21578/reut2-003.sgm
3 l_topics: 4000 topic_err: 409
3 l_title: 4000 title_err: 1
3 l_body: 4000 body_err: 74
../data/reuteurs21578/reut2-004.sgm
4 l_topics: 5000 topic_err: 431
4 l_title: 5000 title_err: 5
4 l_body: 5000 body_err: 103
../data/reuteurs21578/reut2-005.sgm
5 l_topics: 6000 topic_err: 387
5 l_title: 6000 title_err: 3
5 l_body: 6000 body_err: 76
../data/reuteurs21578/reut2-006.sgm
6 l_topics: 7000 topic_err: 382
6 l_title: 7000 title_err: 2
6 l_body: 7000 body_err: 79
../data/reuteurs21578/reut2-007.sgm
7 l_topics: 8000 topic_err: 412
7 l_title: 8000 title_err: 8
7 l_body: 8000 body_err: 1

Unnamed: 0,topics,title,body
0,[cocoa],BAHIA COCOA REVIEW,Showers continued throughout the week in\nthe ...
1,[Unkown],STANDARD OIL <SRD> TO FORM FINANCIAL UNIT,Standard Oil Co and BP North America\nInc said...
2,[Unkown],TEXAS COMMERCE BANCSHARES <TCB> FILES PLAN,Texas Commerce Bancshares Inc's Texas\nCommerc...
3,[Unkown],TALKING POINT/BANKAMERICA <BAC> EQUITY OFFER,BankAmerica Corp is not under\npressure to act...
4,"[grain, wheat, corn, barley, oat, sorghum]",NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,The U.S. Agriculture Department\nreported the ...
...,...,...,...
20573,[ship],JAPAN/INDIA CONFERENCE CUTS GULF WAR RISK CHARGES,The Japan/India-Pakistan-Gulf/Japan\nshipping ...
20574,[ipi],SOVIET INDUSTRIAL GROWTH/TRADE SLOWER IN 1987,The Soviet Union's industrial output is\ngrowi...
20575,[gold],SIX KILLED IN SOUTH AFRICAN GOLD MINE ACCIDENT,Six black miners have been killed\nand two inj...
20576,[Unkown],PROJECTIONS SHOW SWISS VOTERS WANT TRIED PARTIES,The prospect of a dominant alliance of\nsocial...


### Topic Counter: Extract TOP 6

In [5]:
topic_counter = {}
for topics in reuteurs_df['topics']:
    for t in topics:
        if t in topic_counter.keys():
            topic_counter[t] += 1
        else:
            topic_counter[t] = 1

pd.DataFrame.from_dict(topic_counter, orient='index', columns=['freq']).sort_values(['freq'], ascending=False)[:10]

Unnamed: 0,freq
Unkown,9801
earn,3939
acq,2293
money-fx,726
grain,593
crude,579
trade,503
interest,479
wheat,288
ship,286


In [6]:
best_topics = ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade']

In [7]:
l = []
for index, row in reuteurs_df.iterrows():
    if len(row['topics']) == 1:
        for t in row['topics']:
            if t in best_topics:
                # from array to elem
                row[0] = row[0][0]
                l += [row]
                
top_topics_reuteurs_df = pd.DataFrame(l)
top_topics_reuteurs_df

Unnamed: 0,topics,title,body
8,earn,CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT,Champion Products Inc said its\nboard of direc...
9,acq,COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE,Computer Terminal Systems Inc said\nit has com...
10,earn,COBANCO INC <CBCO> YEAR NET,"Shr 34 cts vs 1.19 dlrs\n Net 807,000 vs 2,..."
12,earn,AM INTERNATIONAL INC <AM> 2ND QTR JAN 31,Oper shr loss two cts vs profit seven cts\n ...
13,earn,BROWN-FORMAN INC <BFD> 4TH QTR NET,Shr one dlr vs 73 cts\n Net 12.6 mln vs 15....
...,...,...,...
20554,acq,CABLE AND WIRELESS TO MERGE TWO H.K. UNITS INT...,Unkown
20555,money-fx,BALLADUR INSISTS ON MAINTENANCE OF LOUVRE ACCORDS,French Finance Minister Edouard Balladur\nissu...
20561,trade,PHILIPPINE TRADE GAP WIDENS IN JANUARY-AUGUST,The Philippines' trade deficit widened to\n542...
20567,crude,"IRAN, SOVIET UNION TO SWAP CRUDE, REFINED PROD...",The Soviet Union has agreed to supply\nIran wi...


### Preprocessor: Text Cleaning

In [8]:
def preprocess_text_clean(msg):
    msg = msg.replace('\n', ' ')
    msg = msg.replace('\r', ' ')
    msg = msg.replace('Reuter\x03', ' ')
    msg = msg.replace('REUTER\x03', ' ')
    msg = msg.replace('Reuter \x03', ' ')
    
    # remove <tag>
    #msg = re.sub(r'<(?<=<).*?(?=>)>', ' ', msg)
    msg = msg.replace('<', ' ')
    msg = msg.replace('>', ' ')
    
    # remove  -
    msg = msg.replace(' - ', ' ')
    
    # reformat  ,
    msg = msg.replace(' , ', ', ')
    
    # join tokens
    msg = ' '.join(msg.split())
    
    return msg

### Tokenzier: Spacy

In [9]:
nlp = spacy.load('en_core_web_lg')

def lemma_tokenizer(sentence):
    doc = nlp(sentence)
    tokens = [token.lemma_ for token in doc if token.lemma_ not in stopwords.words('english')]
    return tokens

### Bag of Words (CountVectorizer)

In [10]:
vectorizer = CountVectorizer(lowercase=True, 
                             preprocessor=preprocess_text_clean, 
                             tokenizer=lemma_tokenizer, 
                             ngram_range=(1,3),
                            max_features=50000)

### Split

In [11]:
X = top_topics_reuteurs_df['body']
y = top_topics_reuteurs_df['topics'].astype("category").cat.codes

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

In [13]:
start = time.time()
X_train = vectorizer.fit_transform(X_train)
end = time.time()
print('elapsed time:', end - start)

elapsed time: 123.78480052947998


In [14]:
print("Shape: ",X_train.shape)
vectorizer.vocabulary_

Shape:  (5018, 50000)


{'TransCanada': 21771,
 'PipeLines': 19915,
 'Ltd': 18215,
 'deny': 28560,
 'report': 41300,
 'raise': 40523,
 'offer': 37532,
 'Dome': 15159,
 'Petroleum': 19822,
 'DMP': 14820,
 '5.5': 9992,
 'billion': 25381,
 'canadian': 25952,
 'dlrs': 29089,
 '4.3': 9505,
 '.': 3304,
 'Canadian': 13726,
 'Broadcasting': 13274,
 'Corp': 14622,
 '(': 921,
 'CBC': 13452,
 ')': 1000,
 'late': 33761,
 'last': 33642,
 'night': 37297,
 'say': 42767,
 "'s": 558,
 'new': 37173,
 'bid': 25264,
 'reject': 41065,
 ',': 1041,
 'accept': 22898,
 '5.1': 9965,
 'dlr': 29007,
 'Amoco': 12063,
 '"': 1,
 'still': 46296,
 'want': 49187,
 'acquire': 23056,
 'asset': 24683,
 'prepared': 39234,
 'negotiate': 37047,
 'however': 32397,
 'present': 39240,
 'proposal': 39993,
 'company': 26942,
 'since': 45554,
 'April': 12155,
 '16': 6436,
 'refuse': 40991,
 'date': 28268,
 'president': 39250,
 'Gerald': 16398,
 'consist': 27447,
 'two': 47670,
 'part': 38308,
 '--': 3295,
 '4.5': 9524,
 'cash': 26104,
 'security': 44014,

### Multinomial Naive Bayes

In [15]:
clf = MultinomialNB()
X_test = vectorizer.transform(X_test)

In [16]:
start = time.time()
pred = clf.fit(X_train, y_train).predict(X_test)
end = time.time()
print('elapsed time:', end - start)

elapsed time: 0.06474757194519043


In [17]:
# MACRO: Calculate metrics for each label, and find their unweighted mean. 
# This does not take label imbalance into account.
# MICRO: Calculate metrics globally by counting the total true positives, false negatives and false positives.
# WEIGHTED: Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). 
# This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.

def print_score(y_pred, y_test):
    print('f1 macro score:', f1_score(y_test,pred, average='macro'))
    print('f1 micro score:', f1_score(y_test,pred, average='micro'))
    print('f1 weighted score:', f1_score(y_test,pred, average='weighted'))
    print("----------------------")
    print('recall macro score:', recall_score(y_test,pred, average='macro'))
    print('recall micro score:', recall_score(y_test,pred, average='micro'))
    print('recall weighted score:', recall_score(y_test,pred, average='weighted'))
    print("----------------------")
    print('precision macro score:', precision_score(y_test,pred, average='macro'))
    print('precision micro score:', precision_score(y_test,pred, average='micro'))
    print('precision weighted score:', precision_score(y_test,pred, average='weighted'))
    # print("----------------------")
    # print('roc_auc_score macro score:', roc_auc_score(y_test,pred, average='macro', multi_class='ovr'))
    # print('roc_auc_score micro score:', roc_auc_score(y_test,pred, average='micro'))
    # print('roc_auc_score weighted score:', roc_auc_score(y_test,pred, average='weighted'))
    

In [18]:
print_score(pred, y_test)

f1 macro score: 0.7333948555137733
f1 micro score: 0.898651789865179
f1 weighted score: 0.896095136500183
----------------------
recall macro score: 0.7248364130114443
recall micro score: 0.898651789865179
recall weighted score: 0.898651789865179
----------------------
precision macro score: 0.7475823484580061
precision micro score: 0.898651789865179
precision weighted score: 0.8951212782108448


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


------------------------------------------------------
# Remarques
- On remaque que nous avons un bon f1 score ce qui indique que l'ont a pas trop de marges entre les classes. (Class Inbalance)
- On remarque un bon score de recall.
    - ```Recall is a metric that quantifies the number of correct positive predictions made out of all positive predictions that could have been made```
- On remarque un bon score de precision. 
    - ```The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.```
    - Le clf diférencie bien les topics

## Amazon: Sentiment Analysis

### Load Data

In [19]:
for root, _, files  in os.walk('../data/amazon/processed_acl/books/'):
    for f in files:
        file = root + f
        print('open', file)
        fd = open(file, 'r')
        for line in fd.readlines():
            print(line)
            break

open ../data/amazon/processed_acl/books/positive.review
holes:1 must:1 top_secret:1 he:1 center:1 other_civilans:1 the_pacific:1 the_navy:1 a_lot:1 surface_must:1 this_book:1 man_named:1 <num>_feet:2 would_strongly:1 put_down:1 norman_johnson:1 lawes:1 a_top:1 the_support:1 ten:1 on_random:1 typhoon:1 a_phycologist:1 pressure:1 actually_an:1 a_day:1 johnson_is:1 strange:1 civilans_to:1 explored:1 support:1 pacific_ocean:1 pressure_to:1 back_some:1 read:1 however_on:1 it_still:1 stuck:1 a_remote:1 american:1 find_out:1 not_have:1 other_crichton:1 analysis.i:1 black_holes:1 after_a:1 misssion:1 some_strange:1 half_mile:1 <num>_navy:1 actually:1 remote_location:1 the_story:1 behavior:1 civilans:2 research_that:1 michael_crichton:1 excellant_novel:1 strongly_recommend:1 some:2 michael:1 does_not:1 strange_things:1 sphere_by:1 a_man:1 are_joined:1 mile_long:1 put:1 operations:1 spacecraft:2 around:1 joined:1 an_american:1 sea:1 a_half:1 pacific:1 the_civilans:1 information_on:1 day_under:1 

In [78]:
rows = []

for root_topics, dirs_topic, _  in os.walk('../data/amazon/processed_acl/'):
    for topic in dirs_topic:
        for root_topic, _, files_topic in os.walk(root_topics + topic):
            for file in files_topic:
                print(topic, file)
                file = root_topic + "/" + file
                print('open', file)
                fd = open(file, 'r')
                for line in fd.readlines():
                    
                    # remove count
                    line = re.sub(r':\d', '', line)
                    
                    tokens = line.split()
                    sentence = ' '.join(tokens[:-1])
                    target = tokens[-1:]
                    if "positive" in str(target):
                        target = 1
                    elif "negative" in str(target):
                        target = 0
                    else:
                        print("error !!!!!", target)
                    
                    rows += [[topic, sentence, target]]
print(len(rows))

books positive.review
open ../data/amazon/processed_acl/books/positive.review
books negative.review
open ../data/amazon/processed_acl/books/negative.review
books unlabeled.review
open ../data/amazon/processed_acl/books/unlabeled.review
kitchen positive.review
open ../data/amazon/processed_acl/kitchen/positive.review
kitchen negative.review
open ../data/amazon/processed_acl/kitchen/negative.review
kitchen unlabeled.review
open ../data/amazon/processed_acl/kitchen/unlabeled.review
dvd positive.review
open ../data/amazon/processed_acl/dvd/positive.review
dvd negative.review
open ../data/amazon/processed_acl/dvd/negative.review
dvd unlabeled.review
open ../data/amazon/processed_acl/dvd/unlabeled.review
electronics positive.review
open ../data/amazon/processed_acl/electronics/positive.review
electronics negative.review
open ../data/amazon/processed_acl/electronics/negative.review
electronics unlabeled.review
open ../data/amazon/processed_acl/electronics/unlabeled.review
27677


In [79]:
amazon_df = pd.DataFrame(rows, columns=['topic', 'sentence', 'target'])
amazon_df

Unnamed: 0,topic,sentence,target
0,books,holes must top_secret he center other_civilans...,1
1,books,i_think dr_dean reason oz medicine_which stori...,1
2,books,woman_the contains_the fan_i alex_ross(superma...,1
3,books,hurricane these_pages lost_innocence both at_h...,1
4,books,while commented the_rise if strong_emphasis fo...,1
...,...,...,...
27672,electronics,came tighter front_of fit_tighter make bent_th...,1
27673,electronics,advice nothing_about code buying_a <num>_month...,0
27674,electronics,and_used peers_during her month_trip remote_ch...,1
27675,electronics,deal_with angriest_is <dash-num> me_angriest a...,0


In [80]:
amazon_df['topic'].value_counts()

kitchen        7945
electronics    7681
books          6465
dvd            5586
Name: topic, dtype: int64

In [81]:
amazon_df['target'].value_counts()

1    13882
0    13795
Name: target, dtype: int64

In [82]:
vectorizer2 = CountVectorizer(max_features=50000)

In [107]:
X2 = amazon_df['sentence']
y2 = amazon_df['target']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=random_state)

In [108]:
start = time.time()
X_train2 = vectorizer2.fit_transform(X_train2)
end = time.time()
print('elapsed time:', end - start)
print("X_train2.shape", X_train2.shape)

elapsed time: 4.827284336090088
X_train2.shape (19373, 50000)


In [109]:
X_test2 = vectorizer2.transform(X_test2)
print("X_test2.shape", X_test2.shape)

X_test2.shape (8304, 50000)


In [110]:
start = time.time()
pred2 = clf.fit(X_train2, y_train2).predict(X_test2)
end = time.time()
print('elapsed time:', end - start)

elapsed time: 0.04524874687194824


In [112]:
print('f1 macro score:', f1_score(y_test2,pred2, average='macro'))
print('f1 micro score:', f1_score(y_test2,pred2, average='micro'))
print('f1 weighted score:', f1_score(y_test2,pred2, average='weighted'))
print("----------------------")
print('recall macro score:', recall_score(y_test2,pred2, average='macro'))
print('recall micro score:', recall_score(y_test2,pred2, average='micro'))
print('recall weighted score:', recall_score(y_test2,pred2, average='weighted'))
print("----------------------")
print('precision macro score:', precision_score(y_test2,pred2, average='macro'))
print('precision micro score:', precision_score(y_test2,pred2, average='micro'))
print('precision weighted score:', precision_score(y_test2,pred2, average='weighted'))

f1 macro score: 0.885595809333078
f1 micro score: 0.8855973025048169
f1 weighted score: 0.8855956102435127
----------------------
recall macro score: 0.8855992482206148
recall micro score: 0.8855973025048169
recall weighted score: 0.8855973025048169
----------------------
precision macro score: 0.8856220646382892
precision micro score: 0.8855973025048169
precision weighted score: 0.8856236122716312


------------------------------------------------------
# Remarques
- On remarque de très bon score. Le clf arrive à bien distinguer les classes.
- La représentation en BOW est efficace.