In [2]:
# Imports
import pandas as pd 
import sklearn
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import tree
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2

from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support

import gensim, logging
from gensim.models import Word2Vec
from scipy import sparse

import re

ImportError: No module named 'nltk'

In [3]:
def loadData(filePath="dataset.csv"):
    deb1 = pd.read_csv('tweets_debate1.csv')
    deb2 = pd.read_csv('tweets_debate2.csv')
    deb3 = pd.read_csv('tweets_debate3.csv')
    debvp = pd.read_csv('tweets_debateVP.csv')
    frames = [deb1, deb2, deb3, debvp]
    df = pd.concat(frames, ignore_index=True)

    usersdf = pd.read_csv('users.csv')
    
    result = pd.merge(df, usersdf, on='userID')
    df = result.loc[:,['text', 'party']]
    
    df = train.dropna()     # drop obs with NaN

    return df["text"],df["party"]

def preProcessing(features, remove_stops=True):
    num_tweets = features.size
    clean_wordlist = []
    clean_tweets = []
    clean_hashtags = []
    if (remove_stops):
        stops = set(stopwords.words('english'))
        for i in range( 0, num_tweets):
            # Get hashtags
            try:
                tags = re.findall(r"#(\w+)", features[i].lower())
            except:
                tags = ""
            words = features[i].lower().split()
            words = [w.lower() for w in words if not w in stops]  
            clean_hashtags.append(tags)
            clean_wordlist.append(words)
            clean_tweets.append(" ".join(words))
    else:
        for i in range( 0, num_tweets):
            # Get hashtags
            tags = re.findall(r"#(\w+)", features[i].lower())
            words = features[i].lower().split()
            clean_hashtags.append(tags)
            clean_wordlist.append(words)
            clean_tweets.append(" ".join(words))
    return clean_tweets, clean_wordlist, clean_hashtags
    
def getDTMByTFIDF(features,nfeatures):
    tfIdf_vectorizer = TfidfVectorizer(max_features=nfeatures)
    dtm = tfIdf_vectorizer.fit_transform(features).toarray()
    return dtm,tfIdf_vectorizer
    
def featuresByChiSq(features,labels,nFeature=5000):
    chi2_model = SelectKBest(chi2,k=nFeature)
    dtm = chi2_model.fit_transform(features,labels)
    return dtm,chi2_model

def featuresByInformationGain(features,labels):
    treeCL = tree.DecisionTreeClassifier(criterion="entropy")
    treeCL = treeCL.fit(features,labels)
    transformed_features = SelectFromModel(treeCL,prefit=True).transform(features)
    return transformed_features

def featuresByLSA(features,ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer =  Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
    
def featuresByLDA(features, labels):
    lda_model = LinearDiscriminantAnalysis()
    dtm = lda_model.fit_transform(features, labels) 
    return dtm, lda_model
    
def featuresByBagOfWords(features):
    vect = CountVectorizer()
    dtm = vect.fit_transform(features)
    return dtm

def featuresByBagOfWordsBiTri(features):
    vect = CountVectorizer(ngram_range=(2,3))
    dtm = vect.fit_transform(features)
    return dtm
    
    
def crossValidate(document_term_matrix,labels,classifier="SVM",nfold=2):
    clf = None
    precision = []
    recall = []
    fscore = []
    
    if classifier == "RF":
        clf = RandomForestClassifier()
    elif classifier == "NB":
        clf = MultinomialNB()
    elif classifier == "SVM":
        clf = LinearSVC()
    
    skf = StratifiedKFold(labels, n_folds=nfold)

    for train_index, test_index in skf:
        X_train, X_test = document_term_matrix[train_index], document_term_matrix[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = clf.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        p,r,f,s = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precision.append(p)
        recall.append(r)
        fscore.append(f)
        
    return np.mean(precision),np.mean(recall),np.mean(fscore)

In [4]:
### main ###
tweets, labels = loadData()

processed_tweets, processed_tweets_wordlist, hashtags = preProcessing(tweets)

dtm,vect = getDTMByTFIDF(processed_tweets, None)
hashtagDtm, vect = getDTMByTFIDF(hashtags, None)

NameError: name 'train' is not defined