In [1]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

In [2]:
# Folder containing the Reuters *.sgm files.  Named DATA_DIR instead of `dir`
# so the built-in dir() is not shadowed.
DATA_DIR = r'Data'
topic_list = ['money', 'fx', 'crude', 'grain', 'trade', 'interest', 'wheat', 'ship', 'corn',
              'oil', 'dlr', 'gas', 'oilseed', 'supply', 'sugar', 'gnp', 'coffee', 'veg',
              'gold', 'soybean', 'bop', 'livestock', 'cpi']
# Set copy of topic_list for constant-time membership tests.
topic_set = frozenset(topic_list)

# docs: unique (topic, article) pairs; data: unique articles, in first-seen order.
docs = []
data = []
# count articles
cnt = 0

# Mirrors of `docs` / `data` used only for O(1) duplicate detection
# (the lists alone would need a linear scan per article).
seen_pairs = set()
seen_articles = set()

# find topic+article pairs
# sorted() makes the document order independent of the file system's listing order.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.endswith('.sgm'):
        continue
    path = os.path.join(DATA_DIR, file)
    with open(path, 'rb') as fp:
        soup = BeautifulSoup(fp, "lxml")
    for tr in soup.find_all('reuters'):
        if tr.topics is None:
            # no <topics> element: nothing to pair this article with
            continue
        content = None
        for topic in tr.topics.children:
            topic_name = topic.string
            if not topic_name:
                # .string is None for nodes with several children; skip instead of crashing
                continue
            # Plain str so the stored value does not keep the parse tree alive.
            topic_name = str(topic_name)
            if topic_name in topic_set:
                matched = [topic_name]
            else:
                # compound labels such as "a-b" are split and each part checked on its own
                matched = [part for part in topic_name.split('-') if part in topic_set]
            if not matched:
                continue
            if content is None:
                # last child node of the <text> element (presumably the article body)
                content = str(tr.find('text').contents[-1])
            for name in matched:
                if (name, content) not in seen_pairs:
                    seen_pairs.add((name, content))
                    docs.append((name, content))
                if content not in seen_articles:
                    seen_articles.add(content)
                    data.append(content)
                    cnt += 1
print("Number of articles:", cnt)
print("Number of topic-article pairs: ", len(docs))

Number of articles: 3666
Number of topic-article pairs:  6252


In [3]:
# stop words
stop_words = stopwords.words('english')
# Snapshot of stop_words as a set: O(1) lookups inside tokenize().
# (Rebuild it if stop_words is edited afterwards.)
_STOP_WORD_SET = frozenset(stop_words)
# One stemmer and one compiled pattern shared by every tokenize() call.
_STEMMER = PorterStemmer()
_STARTS_WITH_LETTERS = re.compile('[a-zA-Z]+')


# tokenize function
def tokenize(text, min_length=3):
    """Turn raw text into a list of filtered, stemmed tokens.

    Steps: word_tokenize -> lower-case -> drop English stop words ->
    Porter-stem -> keep stems that begin with an ASCII letter and are at
    least ``min_length`` characters long.

    Parameters
    ----------
    text : str
        Text to tokenize.
    min_length : int, optional
        Minimum length of a stem to be kept (default 3).

    Returns
    -------
    list of str
        The surviving stems, in their original order.
    """
    lowered = (word.lower() for word in word_tokenize(text))
    stems = (_STEMMER.stem(word) for word in lowered if word not in _STOP_WORD_SET)
    # .match() anchors at the start only, so "x-1986" passes while "1986" does not.
    return [stem for stem in stems
            if _STARTS_WITH_LETTERS.match(stem) and len(stem) >= min_length]

# Tokenize every (topic, article) pair from `docs`; pairs whose article
# produces no tokens at all are dropped.
# spark_data keeps the token lists, documents keeps them joined by spaces.
spark_data = [
    (topic, tokens)
    for topic, tokens in ((topic, tokenize(body)) for topic, body in docs)
    if tokens
]
documents = [(topic, " ".join(tokens)) for topic, tokens in spark_data]
print("Number of tokized topic-article pairs: ", len(documents))

Number of tokized topic-article pairs:  6235


In [4]:
# Print the first ten topic-body pairs.
# Slicing (rather than indexing 0..9) also works when fewer than ten pairs exist.
for pair in documents[:10]:
    print(pair)

('grain', 'u.s. agricultur depart report farmer-own reserv nation five-day averag price februari follow dlrs/bu-sorghum cwt natl loan releas call avg rate-x level price price wheat corn rate natl loan releas call avg rate-x level price price oat barley n.a sorghum reserv iii matur level reflect grain enter oct feedgrain juli wheat level wheat/barley corn/sorghum level cover wheat enter januari x-1986 rate y-dlr per cwt n.a.-not avail reuter')
('wheat', 'u.s. agricultur depart report farmer-own reserv nation five-day averag price februari follow dlrs/bu-sorghum cwt natl loan releas call avg rate-x level price price wheat corn rate natl loan releas call avg rate-x level price price oat barley n.a sorghum reserv iii matur level reflect grain enter oct feedgrain juli wheat level wheat/barley corn/sorghum level cover wheat enter januari x-1986 rate y-dlr per cwt n.a.-not avail reuter')
('corn', 'u.s. agricultur depart report farmer-own reserv nation five-day averag price februari follow dlr

In [5]:
# training data saving path
training_data = r'training_test_data.txt'

# Write one "topic, article" line per document.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default encoding (which may be unable to encode some characters).
with open(training_data, 'w', encoding='utf-8') as f:
    for topic, article in documents:
        f.write(topic + ', ' + article + '\n')

In [7]:
# tf-idf without spark
import time

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
try:
    # train_test_split lives in model_selection in current scikit-learn ...
    from sklearn.model_selection import train_test_split
except ImportError:
    # ... and in cross_validation in the old releases this notebook was written for.
    from sklearn.cross_validation import train_test_split

df = pd.DataFrame(documents, columns=['topic', 'tokens'])

# training_test_data X and Y without Spark
vect = TfidfVectorizer()
X = vect.fit_transform(df['tokens'])

# convert categories to integer labels
label = preprocessing.LabelEncoder()
Y = label.fit_transform(df['topic'])

# Naive Bayes training: hold out 10% of the data as a fixed validation set;
# the remaining 90% (X_train / Y_train) is the pool used by ave_score().
X_train, X_validate, Y_train, Y_validate = train_test_split(X, Y, test_size=0.1, random_state=1234)

# function to get the average score over repeated runs of the model
def ave_score(split, n_runs=10):
    """Average MultinomialNB accuracy over repeated random splits of the training pool.

    Each run splits the module-level ``X_train`` / ``Y_train`` with
    ``train_test_split(test_size=split)``, fits a fresh ``MultinomialNB`` on the
    training part and scores it on the test part.  The splits are unseeded, so
    the result varies between calls.

    Parameters
    ----------
    split : float or int
        Passed to ``train_test_split`` as ``test_size``.
    n_runs : int, optional
        Number of random splits to average over (default 10).

    Returns
    -------
    float
        Mean accuracy over the ``n_runs`` runs.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    scores = []
    for _ in range(n_runs):
        x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=split)
        nb = MultinomialNB()
        nb.fit(x_train, y_train)
        # accuracy_score expects (y_true, y_pred)
        scores.append(accuracy_score(y_test, nb.predict(x_test)))
    return sum(scores) / n_runs

# ave_score() returns a mean accuracy, so the labels say "accuracy" (not "time").
# test_size is a fraction of the 90% training pool: 4/9 of 90% = 40% of all data.
# split the data 50/40
print("The average accuracy for split data 50/40:", ave_score(4/9))
# split the data 60/30
print("The average accuracy for split data 60/30:", ave_score(3/9))
# split the data 70/20
print("The average accuracy for split data 70/20:", ave_score(2/9))
# Expectation: the more training data, the higher the score.

The average time for split data 50/40: 0.342742582197
The average time for split data 60/30: 0.342757883485
The average time for split data 70/20: 0.343303929431


In [6]:
# tf-idf without spark: wall-clock time of building the vectorizer and fitting it
import time
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame(documents, columns=['topic', 'tokens'])

start = time.time()
vect = TfidfVectorizer()
X = vect.fit_transform(df['tokens'])
elapsed = time.time() - start
print("The tf-idf without spark:", elapsed)

The tf-idf without spark: 0.5179998874664307


In [7]:
# Encode the topic names as integer labels and export the frame as data.csv
# (columns: topic, articles), which the Spark cells read back.
from sklearn import preprocessing

label = preprocessing.LabelEncoder()
df = pd.DataFrame(documents, columns=['topic', 'articles'])
df = df.assign(topic=label.fit_transform(df['topic']))
df.to_csv('data.csv', index=False, header=True)

In [1]:
# data frame
import os

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, NGram, ChiSqSelector
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.sql import SparkSession
import pyspark.sql.functions

# Do not rely on a `spark` (or `os`) global injected by the pyspark shell:
# getOrCreate() returns the already-running session if there is one and
# starts a new one otherwise, so this cell also works on a plain kernel.
spark = SparkSession.builder.getOrCreate()

df = spark.read.csv("data.csv", header=True, inferSchema=True)

# The articles are already tokenized and joined with single spaces,
# so splitting on ' ' recovers the token list.
split_col = pyspark.sql.functions.split(df['articles'], ' ')
df = df.withColumn("body", split_col)
df.show()
df.printSchema()

+-----+--------------------+--------------------+
|topic|            articles|                body|
+-----+--------------------+--------------------+
|   10|u.s. agricultur d...|[u.s., agricultur...|
|   22|u.s. agricultur d...|[u.s., agricultur...|
|    2|u.s. agricultur d...|[u.s., agricultur...|
|   21|argentin grain bo...|[argentin, grain,...|
|   14|argentin grain bo...|[argentin, grain,...|
|   17|argentin grain bo...|[argentin, grain,...|
|   15|argentin grain bo...|[argentin, grain,...|
|    2|argentin grain bo...|[argentin, grain,...|
|   10|argentin grain bo...|[argentin, grain,...|
|   22|argentin grain bo...|[argentin, grain,...|
|   22|commod credit cor...|[commod, credit, ...|
|   10|commod credit cor...|[commod, credit, ...|
|   13|      blah blah blah|  [blah, blah, blah]|
|   19|      blah blah blah|  [blah, blah, blah]|
|    1|intern coffe orga...|[intern, coffe, o...|
|   16|mclean industri i...|[mclean, industri...|
|   18|sugar import subj...|[sugar, import, s...|


In [2]:
#tfidf with spark
import time

# Start from the raw columns only, so the cell can be re-run: after a first
# run `df` already carries term_freq/tfidf and transforming it again would
# fail on the duplicate output columns.
base_df = df.select("topic", "articles", "body")

start = time.time()
hashingTF = HashingTF(inputCol="body", outputCol="term_freq")
tf_df = hashingTF.transform(base_df)
# minDocFreq=5: terms appearing in fewer than 5 documents are filtered out by IDF
idf = IDF(inputCol="term_freq", outputCol="tfidf", minDocFreq=5)
idfModel = idf.fit(tf_df)
df = idfModel.transform(tf_df)
# NOTE(review): Spark transformations are lazy, so this figure presumably
# reflects mostly idf.fit() -- confirm before comparing with the sklearn timing.
print("The tf-idf with spark:", time.time()-start)

The tf-idf with spark: 1.124000072479248


In [3]:
# Seeded 90/10 split of the (tfidf, topic) columns: `train` is the pool that
# Bayes() re-splits, `validate` is held out for the saved models.
tfidf_topic = df.select("tfidf", "topic")
split_parts = tfidf_topic.randomSplit([0.9, 0.1], seed=1234)
train = split_parts[0]
validate = split_parts[1]
print("train samples:", train.count())
print("test samples:", validate.count())

# directory prefix under which Bayes() stores its fitted models
output = r'model/'
#apply naive bayes function
def Bayes(split, n_runs=10):
    """Train, score and save Naive Bayes models on random re-splits of ``train``.

    ``train`` holds 90% of the data, so each run sends ``split/90`` of it to
    training and the remainder to testing (e.g. ``split=70`` is 70/20 in terms
    of the whole data set).  Every run prints its test accuracy and stores the
    fitted model under ``output + 'model_<split>_<run>'``; the mean accuracy is
    printed at the end.  The re-splits are unseeded.

    Parameters
    ----------
    split : int or float
        Share of the whole data set (in percent) used for training; must lie
        strictly between 0 and 90 so that both parts are non-empty in expectation.
    n_runs : int, optional
        Number of random re-splits / models (default 10).

    Raises
    ------
    ValueError
        If ``split`` is outside (0, 90) or ``n_runs`` is smaller than 1.
    """
    if not 0 < split < 90:
        raise ValueError("split must be strictly between 0 and 90")
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    train_fraction = split / 90
    ave = 0
    for i in range(n_runs):
        train_data, test_data = train.select("tfidf", "topic").randomSplit(
            [train_fraction, 1 - train_fraction])
        #apply naive bayes
        nb = NaiveBayes(featuresCol="tfidf", labelCol="topic", predictionCol="NB_pred",
                        probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
        nbModel = nb.fit(train_data)
        test = nbModel.transform(test_data)
        #get test accuracy
        total = test.count()
        correct = test.where(test['topic'] == test['NB_pred']).count()
        print("naive bayes unigrams test accuracy:", correct/total)
        ave += correct/total
        # save model; overwrite() lets the notebook be re-run without
        # failing because the model directory already exists
        nbModel.write().overwrite().save(output + 'model' + '_' + str(split) + '_' + str(i))
    print("The average model accuracy: ", ave/n_runs)

train samples: 5625
test samples: 610


In [4]:
# 70/20 split: 70% of all data for training, 20% for testing (10% is held out in `validate`)
Bayes(split=70)

naive bayes unigrams test accuracy: 0.37220843672456577
naive bayes unigrams test accuracy: 0.33965125094768767
naive bayes unigrams test accuracy: 0.3776167471819646
naive bayes unigrams test accuracy: 0.3557312252964427
naive bayes unigrams test accuracy: 0.35753749013417524
naive bayes unigrams test accuracy: 0.3692679002413516
naive bayes unigrams test accuracy: 0.3745082612116444
naive bayes unigrams test accuracy: 0.36726703210649964
naive bayes unigrams test accuracy: 0.35004042037186744
naive bayes unigrams test accuracy: 0.36313291139240506
The average model accuracy:  0.36269616756086037


In [4]:
# 60/30 split: 60% of all data for training, 30% for testing (10% is held out in `validate`)
Bayes(split=60)

naive bayes unigrams test accuracy: 0.35400105988341285
naive bayes unigrams test accuracy: 0.35771920387305
naive bayes unigrams test accuracy: 0.35281501340482574
naive bayes unigrams test accuracy: 0.37567567567567567
naive bayes unigrams test accuracy: 0.3520381154049762
naive bayes unigrams test accuracy: 0.3662486938349007
naive bayes unigrams test accuracy: 0.3605333333333333
naive bayes unigrams test accuracy: 0.36015325670498083
naive bayes unigrams test accuracy: 0.38064859117490696
naive bayes unigrams test accuracy: 0.3629160063391442
The average model accuracy:  0.36227489496292065


In [4]:
# 50/40 split: 50% of all data for training, 40% for testing (10% is held out in `validate`)
Bayes(split=50)

naive bayes unigrams test accuracy: 0.35282176207876575
naive bayes unigrams test accuracy: 0.35119280406726633
naive bayes unigrams test accuracy: 0.35140482785912147
naive bayes unigrams test accuracy: 0.3578990228013029
naive bayes unigrams test accuracy: 0.3679052674561045
naive bayes unigrams test accuracy: 0.3637826961770624
naive bayes unigrams test accuracy: 0.3627760252365931
naive bayes unigrams test accuracy: 0.3582443653618031
naive bayes unigrams test accuracy: 0.3702970297029703
naive bayes unigrams test accuracy: 0.35053235053235055
The average model accuracy:  0.358685615127334


In [4]:
# load model to estimate the validate data
# sorted(): os.listdir() returns entries in arbitrary order, so the slices
# taken from model_list are only well defined on a sorted list.
model_list = sorted(os.listdir(output))
for file in model_list[:3]:
    path = os.path.join(output, file)
    model = NaiveBayesModel.load(path)
    val = model.transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    total = val.count()
    correct = val.where(val['topic'] == val['NB_pred']).count()
    print(path + ':', correct/total)

model/model_50_0: 0.3557377049180328
model/model_50_1: 0.3295081967213115
model/model_50_2: 0.3377049180327869


In [6]:
# Score saved models 3-5 of model_list on the held-out `validate` set.
for model_name in model_list[3:6]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_50_3: 0.34918032786885245
model/model_50_4: 0.3475409836065574
model/model_50_5: 0.3360655737704918


In [7]:
# Score saved models 6-8 of model_list on the held-out `validate` set.
for model_name in model_list[6:9]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_50_6: 0.3377049180327869
model/model_50_7: 0.34590163934426227
model/model_50_8: 0.3377049180327869


In [5]:
# Score saved models 9-11 of model_list on the held-out `validate` set.
for model_name in model_list[9:12]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_50_9: 0.35081967213114756
model/model_60_0: 0.3475409836065574
model/model_60_1: 0.3344262295081967


In [6]:
# Score saved models 12-14 of model_list on the held-out `validate` set.
for model_name in model_list[12:15]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_60_2: 0.3295081967213115
model/model_60_3: 0.3524590163934426
model/model_60_4: 0.33114754098360655


In [7]:
# Score saved models 15-17 of model_list on the held-out `validate` set.
for model_name in model_list[15:18]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_60_5: 0.34590163934426227
model/model_60_6: 0.34098360655737703
model/model_60_7: 0.3262295081967213


In [8]:
# Score saved models 18-20 of model_list on the held-out `validate` set.
for model_name in model_list[18:21]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_60_8: 0.34590163934426227
model/model_60_9: 0.32459016393442625
model/model_70_0: 0.32295081967213113


In [9]:
# Score saved models 21-23 of model_list on the held-out `validate` set.
for model_name in model_list[21:24]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_70_1: 0.32459016393442625
model/model_70_2: 0.319672131147541
model/model_70_3: 0.33934426229508197


In [10]:
# Re-list the saved models and score entries 24-26 on the held-out `validate` set.
# sorted(): os.listdir() returns entries in arbitrary order, so the slices
# taken from model_list are only well defined on a sorted list.
model_list = sorted(os.listdir(output))
for file in model_list[24:27]:
    path = os.path.join(output, file)
    model = NaiveBayesModel.load(path)
    val = model.transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    total = val.count()
    correct = val.where(val['topic'] == val['NB_pred']).count()
    print(path + ':', correct/total)

model/model_70_4: 0.3360655737704918
model/model_70_5: 0.33278688524590166
model/model_70_6: 0.33114754098360655


In [6]:
# Score saved models 27-29 of model_list on the held-out `validate` set.
for model_name in model_list[27:30]:
    model_path = os.path.join(output, model_name)
    scored = NaiveBayesModel.load(model_path).transform(validate)
    # validation accuracy: rows whose prediction equals the true topic / all rows
    n_rows = scored.count()
    n_hits = scored.where(scored['topic'] == scored['NB_pred']).count()
    print(model_path + ':', n_hits/n_rows)

model/model_70_7: 0.34098360655737703
model/model_70_8: 0.3262295081967213
model/model_70_9: 0.33934426229508197
