__author__ = 'abhishekchoudhary'
# load csv file
from pyspark import SparkConf, SparkContext
from Util import Ready
import pandas as pd
import os
import nltk
import numpy as np
import csv
import re
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from random import shuffle
# English stop words from the NLTK stopwords corpus, held in a set;
# PrepareData.word_feature skips any word whose lowercase form is in it.
stopset = set(stopwords.words('english'))
class PrepareData(object):
    """Load the tweet CSV and evaluate a unigram Naive Bayes classifier.

    All console output goes through single-argument ``print(...)`` calls so
    the text is the same under Python 2's print statement and Python 3's
    print function.
    """

    # Default CSV location used by prepare() when no path is given.
    DEFAULT_FILE_PATH = "/Users/abhishekchoudhary/Work/python/training.1600000.processed.noemoticon.csv"
    # Column names passed as ``header`` to Ready.readFile.
    CSV_HEADER = ("polarity", "userid", "date", "none", "username", "tweets")

    def prepare(self, filePath=None):
        """Read the CSV through ``Ready``, clean it and return the result.

        Args:
            filePath: Path of the CSV to load. Defaults to
                ``DEFAULT_FILE_PATH`` when omitted.

        Returns:
            Whatever ``Ready.cleanDataFrame`` returns; ``unigramAnalysis``
            uses it through ``iterrows()`` with the ``tweets`` and
            ``polarity`` columns.
        """
        if filePath is None:
            filePath = self.DEFAULT_FILE_PATH
        util = Ready()
        df = util.readFile(filePath, header=list(self.CSV_HEADER))
        df = util.cleanDataFrame(df)
        print(df.head(5))
        return df

    def word_feature(self, words):
        """Build a bag-of-words feature dict ``{lowercased_word: True}``.

        Args:
            words: Iterable of string tokens.

        Returns:
            Dict keyed by the lowercase form of every token that is not in
            the module-level ``stopset``.
        """
        # Stop-word filtering improved the accuracy, so it is kept.
        lowered = (word.lower() for word in words)
        return {word: True for word in lowered if word not in stopset}

    def unigramAnalysis(self, word_extract_feature, df, k_folds=10):
        """Train and evaluate a Naive Bayes classifier on unigram features.

        The first three quarters of the rows train the main classifier and
        the rest are the test set. The training part is then shuffled and
        cross-validated, and accuracy plus precision/recall for the
        ``'Happy'`` and ``'Sad'`` labels are printed.

        Args:
            word_extract_feature: Callable mapping a list of tokens to a
                feature dict (for example ``self.word_feature``).
            df: Object exposing ``iterrows()`` whose rows provide ``tweets``
                (a string) and ``polarity`` (the label).
            k_folds: Number of cross-validation folds, at least 2.
                NOTE(review): 10 is a conventional default; confirm the
                intended fold count.

        Raises:
            ValueError: If ``k_folds`` is smaller than 2.

        An ``AttributeError`` raised anywhere during the analysis (for
        instance a ``tweets`` value without ``split``) is reported on
        stdout and the method returns without finishing.
        """
        if k_folds < 2:
            raise ValueError("k_folds must be at least 2")
        try:
            self._run_unigram_analysis(word_extract_feature, df, k_folds)
        except AttributeError as err:
            self._report(Exception, err)

    def _run_unigram_analysis(self, word_extract_feature, df, k_folds):
        """Extract features, train, cross-validate and print the metrics."""
        features = [
            (word_extract_feature(row['tweets'].split()), row['polarity'])
            for _, row in df.iterrows()
        ]
        print("Appended all Dataset features")

        # Floor division keeps the cutoff usable as a slice index.
        # NOTE(review): the split follows row order; if the rows arrive
        # sorted by label, shuffle before splitting -- confirm with the data.
        cutoff = len(features) * 3 // 4
        trainfeats = features[:cutoff]
        testfeats = features[cutoff:]

        print("Start Classifying or feature extraction")
        classifier = NaiveBayesClassifier.train(trainfeats)
        print("End of Main TraininG Classifiers")

        # K-fold cross-validation: average the accuracy over several
        # train/held-out partitions of the shuffled training set.
        print("Shuffling of entire Training Set")
        shuffle(trainfeats)
        print("Shuffling Done")
        scores = self._k_fold_scores(trainfeats, k_folds)
        self._report("K-Fold scores done ", scores)

        refsets, testsets = self._index_by_label(testfeats, classifier.classify)

        if scores:
            self._report('Average accuracy K-Fold ',
                         sum(scores) / float(len(scores)))
        self._report('accuracy:',
                     nltk.classify.util.accuracy(classifier, testfeats))
        # NOTE(review): assumes the cleaned polarity labels are exactly
        # 'Happy' and 'Sad' -- confirm against Ready.cleanDataFrame.
        self._report('Happy precision:',
                     nltk.metrics.precision(refsets['Happy'], testsets['Happy']))
        self._report('Happy recall:',
                     nltk.metrics.recall(refsets['Happy'], testsets['Happy']))
        self._report('Sad precision:',
                     nltk.metrics.precision(refsets['Sad'], testsets['Sad']))
        self._report('Sad recall:',
                     nltk.metrics.recall(refsets['Sad'], testsets['Sad']))

    def _k_fold_scores(self, trainfeats, k_folds):
        """Return one accuracy score per non-empty held-out fold."""
        folds = self._split_folds(trainfeats, k_folds)
        scores = []
        for held_out_index, held_out in enumerate(folds):
            if not held_out:
                # Nothing to score when there are fewer items than folds.
                continue
            training = [
                item
                for index, fold in enumerate(folds)
                if index != held_out_index
                for item in fold
            ]
            classifier_sub = NaiveBayesClassifier.train(training)
            scores.append(nltk.classify.util.accuracy(classifier_sub, held_out))
        return scores

    @staticmethod
    def _split_folds(items, k):
        """Split ``items`` into ``k`` contiguous slices of near-equal size.

        The first ``len(items) % k`` slices get one extra element, which is
        the same sizing rule ``numpy.array_split`` uses.
        """
        base, extra = divmod(len(items), k)
        folds = []
        start = 0
        for index in range(k):
            end = start + base + (1 if index < extra else 0)
            folds.append(items[start:end])
            start = end
        return folds

    @staticmethod
    def _index_by_label(testfeats, classify):
        """Group test-item indices by true label and by predicted label.

        Returns:
            ``(refsets, testsets)``: two ``defaultdict(set)`` objects mapping
            a label to the indices whose reference label, respectively
            ``classify(feats)`` prediction, equals that label.
        """
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            testsets[classify(feats)].add(i)
        return refsets, testsets

    @staticmethod
    def _report(label, value):
        """Print ``label`` and ``value`` separated by a single space."""
        print('%s %s' % (label, value))
# Spark configuration: "yarn-client" master, a named application, and
# spark.executor.memory set to "1g".
conf = (
    SparkConf()
    .setMaster("yarn-client")
    .setAppName("Long DataSet In YARN")
    .set("spark.executor.memory", "1g")
)
# NOTE(review): the context is created here but is not referenced by the
# analysis calls below.
sc = SparkContext(conf=conf)

# Load and clean the tweets, then run the unigram analysis with
# PrepareData.word_feature as the feature extractor.
data = PrepareData()
df = data.prepare()
data.unigramAnalysis(word_extract_feature=data.word_feature, df=df)