In [None]:
import langid
import re
import pickle
import config
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords  # for using english stopwords
from gensim.models.phrases import Phrases
from sqlalchemy import create_engine
from nltk.stem.wordnet import WordNetLemmatizer
from tweetf0rm.handler.oracle_handler import OracleHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from matplotlib import pyplot as plt
from gensim.utils import deaccent, decode_htmlentities, lemmatize


def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as a colour-coded image on the current pyplot figure.

    The tick labels on both axes are taken from the module-level ``my_tags``.

    Parameters:
    ----------
    cm : Matrix of values to display; handed straight to ``plt.imshow``.
    title : String. Title placed above the plot.
    cmap : Matplotlib colormap used to colour the cells.
    """
    plt.imshow(cm, cmap=cmap, interpolation='nearest')
    plt.title(title)
    plt.colorbar()

    # One tick per tag, at positions 0 .. len(my_tags) - 1 on both axes.
    positions = np.arange(len(my_tags))
    plt.yticks(positions, my_tags)
    plt.xticks(positions, my_tags, rotation=90)

    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    
def evaluate_prediction(predictions, target, title="Confusion matrix"):
    """
    Print the accuracy and confusion matrix of a set of predictions, then plot
    the row-normalized confusion matrix.

    Parameters:
    ----------
    predictions : Predicted labels.
    target : Expected (true) labels.
    title : String. Plot title; ' Normalized' is appended to it.
    """
    accuracy = accuracy_score(target, predictions)
    print('accuracy %s' % accuracy)

    cm = confusion_matrix(target, predictions)
    print('confusion matrix\n %s' % cm)
    print('(row=expected, col=predicted)')

    # Divide every row by its own total so each row shows proportions.
    # NOTE(review): a row whose total is 0 divides by zero here.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    plot_confusion_matrix(cm.astype('float') / row_totals, title + ' Normalized')
    
def most_influential_words(clf, vectorizer, category_index=0, num_words=10):
    """
    Return the features with the largest coefficients for one category.

    Parameters:
    ----------
    clf : Fitted linear model exposing ``coef_`` (one row of weights per category).
    vectorizer : Fitted vectorizer whose feature names line up with ``coef_`` columns.
    category_index : Row of ``clf.coef_`` to inspect.
    num_words : Maximum number of feature names to return.

    Returns:
    -------
    List of feature names, ordered from the largest coefficient downwards.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both so the helper works on either side.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    features = names_getter()

    ranked = sorted(enumerate(clf.coef_[category_index]),
                    key=lambda pair: pair[1], reverse=True)
    return [features[index] for index, _ in ranked[:num_words]]

def remove_url(documents):
    """
    Strip links and @mentions from the text of every document.

    Parameters:
    ----------
    documents : Iterable of pairs whose second item is the text.

    Returns:
    -------
    List of ``(first_item, cleaned_text)`` tuples, in the input order.
    """
    # "https?" makes the "s" optional. The previous pattern "http?\://" made the
    # "p" optional instead, so https:// links were never removed.
    return [(doc[0], re.sub(r"(?:@|https?://)\S+", "", doc[1])) for doc in documents]

# def filter_lang(lang, documents):
#     return (doc for doc in documents if langid.classify(doc[1])[0] == lang)

def filter_lang(text,lang='en'):
    """
    Tell whether langid's first classification result for ``text`` equals ``lang``.

    Parameters:
    ----------
    text : String to classify.
    lang : Language code to compare against (default 'en').

    Returns:
    -------
    True if the first element returned by ``langid.classify`` equals ``lang``.
    """
    detected_language = langid.classify(text)[0]
    return detected_language == lang

def hashtag(text):
    """
    Replacement callback that rewrites a matched hashtag into marker tokens.

    Parameters:
    ----------
    text : Regex match object whose matched string is the hashtag, including
           its leading character (which is dropped).

    Returns:
    -------
    "<hashtag> BODY <allcaps>" when the body is all upper case, otherwise
    "<hashtag>" followed by the body split in front of every capital letter.
    """
    hashtag_body = text.group()[1:]
    if hashtag_body.isupper():
        return "<hashtag> {} <allcaps>".format(hashtag_body)
    # The original passed flags=FLAGS, but FLAGS is not defined in this file,
    # so this branch raised NameError. The look-ahead needs no flags at all.
    # NOTE: splitting on a zero-width match only takes effect on Python 3.7+.
    pieces = re.split(r"(?=[A-Z])", hashtag_body)
    return " ".join(["<hashtag>"] + pieces)

def clean_tweet(tweet):
    """
    Normalize one tweet into a space-separated string of lemmatized words.

    Steps, in order: drop @mentions and links, lower-case and drop stop words
    (``stops``), delete punctuation characters (``exclude``), then lemmatize
    every remaining word with ``lemma``.

    Parameters:
    ----------
    tweet : String. Raw tweet text.

    Returns:
    -------
    String. The cleaned tweet.
    """
    # Removes anything starting with "@", "https", or "http://" up to the next space.
    without_links = re.sub(r"(?:\@|https|http?\://)\S+", "", tweet)

    # Stop words are matched before punctuation is stripped, so "the," is kept.
    kept_words = [word for word in without_links.lower().split() if word not in stops]

    # Delete individual punctuation characters (not whole words).
    no_punctuation = ''.join([char for char in " ".join(kept_words) if char not in exclude])

    lemmas = [lemma.lemmatize(word) for word in no_punctuation.split()]
    return " ".join(lemmas)

def get_similar_word_correlation(train_texts):
    """
    Build a sparse word-to-word matrix linking each vocabulary word to its
    close word2vec neighbours.

    The matrix starts as the identity; entry (i, j) is additionally set to 1
    when word j is among the 5 most similar words to word i with a similarity
    above 0.5 and is itself in the vocabulary.

    Parameters:
    ----------
    train_texts : Iterable of token lists used to build the vocabulary.

    Returns:
    -------
    correlation_matrix : scipy.sparse DOK matrix of shape (len(vocab), len(vocab)).

    Relies on a module-level ``w2vmodel`` exposing ``most_similar`` -- it is not
    defined in this file; presumably a loaded gensim word2vec model (TODO confirm).
    """
    # Imported locally: neither name is imported at the top of this file.
    import scipy.sparse
    from gensim.corpora import Dictionary

    # The original built the dictionary from an undefined ``tweet_processed``
    # and never used its ``train_texts`` argument.
    dictionary = Dictionary(train_texts)
    vocab = dictionary.token2id

    correlation_matrix = scipy.sparse.identity(len(vocab), format="dok")
    for word, word_id in vocab.items():
        try:
            neighbours = w2vmodel.most_similar(word.lower(), topn=5)
        except KeyError:
            # Word unknown to the embedding model: it keeps only its diagonal entry.
            continue
        for similar_word, similarity in neighbours:
            if similarity > 0.5 and similar_word in vocab:
                correlation_matrix[word_id, vocab[similar_word]] = 1

    return correlation_matrix

#         term_frequency_vector += term_frequency_vector * correlation_matrix

# def allcaps(text):
#     text = text.group()
#     return text.lower() + " <allcaps> "


# def preprocess_tweet2(text):
#     # Different regex parts for smiley faces
#     eyes = r"[8:=;]"
#     nose = r"['`\-]?"

#     # function so code less repetitive
#     def re_sub(pattern, repl):
#         return re.sub(pattern, repl, text, flags=FLAGS)

#     text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
#     text = re_sub(r"/"," / ")
#     text = re_sub(r"@\w+", "<user>")
#     text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
#     text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
#     text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
#     text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
#     text = re_sub(r"<3","<heart>")
#     text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", " <number> ")
#     text = re_sub(r"#\S+", hashtag)
#     text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
#     text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

#     ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
#     # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
#     text = re_sub(r"([A-Z]){2,}", allcaps)
    
#     text = ''.join(ch for ch in text if ch not in exclude)  # Remove punctuation

#     return text.lower()

def preprocess_text(tweet):
    """
    Function to process an aggregated user profile. This does the following:
    1. Decode html entities. eg. "AT&amp;T" will become "AT&T"
    2. Deaccent and drop any remaining non-ASCII characters.
    3. Remove links and user mentions (@name).
    4. Lower-case, remove stopwords and strip punctuation characters.
    5. Lemmatize, keeping only tokens whose tag matches 'NN'.

    Parameters:
    ----------
    tweet : String. If train_texts is a list of tweets, ' '.join and pass

    Returns:
    -------
    tweet : preprocessed (tokenized) tweet, as a list of lemmas.
    """
    # Entities must be decoded BEFORE punctuation is stripped: the previous
    # order removed '&' and ';' first, turning "&amp;" into the word "amp".
    tweet = decode_htmlentities(tweet)
    tweet = deaccent(tweet)
    # Drop what is still non-ASCII, then decode again so the regex and string
    # steps below keep operating on text rather than bytes.
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')

    # Links and @mentions go in one pass. The separate r'@\w+' pass that used to
    # follow could never match, because '@' had already been stripped by then.
    tweet = re.sub(r"(?:\@|https|http?\://)\S+", "", tweet)

    words = [word for word in tweet.lower().split() if word not in stops]
    tweet = ''.join(ch for ch in ' '.join(words) if ch not in exclude)

    # Collapse runs of whitespace left behind by removed punctuation.
    tweet = ' '.join(tweet.split())
    # NOTE(review): the split below assumes lemmatize yields "lemma/TAG" items -- confirm.
    lemmas = lemmatize(tweet, re.compile('(NN)'), stopwords=stops, min_length=3, max_length=15)
    return [word.split('/')[0] for word in lemmas]


def get_dataframes(pycon_dict):
    """
    Function to get train and test dataframes (without any preprocessing).

    For every entity, a random 90% of its tweets are joined into one training
    document and the remaining tweets into one test document, so each frame
    holds one row per entity.

    Parameters:
    ----------
    pycon_dict: The twitter user dictionary being used
                (category -> entity -> sequence of objects with a ``.text``).

    Returns:
    -------
    train, test: Train and test dataframes with the module-level ``columns``.
    """
    # Rows are collected in plain lists and turned into frames once at the end:
    # DataFrame.append copied the whole frame per row and was removed in pandas 2.0.
    train_rows = []
    test_rows = []

    for category in pycon_dict:
        category_id = categories_map[category]
        for entity in pycon_dict[category]:
            tweets = pycon_dict[category][entity]
            num_texts = len(tweets)  # To get number of tweets
            # Random selection without replacement of 90% of the tweets
            train_indices = np.random.choice(num_texts, int(0.9 * num_texts), replace=False)
            # Set membership keeps the complement computation linear
            chosen = set(train_indices.tolist())
            test_indices = [i for i in range(num_texts) if i not in chosen]  # Rest go into test set

            train_text = ' '.join(tweets[i].text for i in train_indices)
            test_text = ' '.join(tweets[i].text for i in test_indices)
            train_rows.append([train_text, category_id, category])
            test_rows.append([test_text, category_id, category])

    train = pd.DataFrame(train_rows, columns=columns)
    test = pd.DataFrame(test_rows, columns=columns)
    return train, test

def predicit_categroies(df, bigram, clf):
    """
    Predict a category for every row of ``df``.

    Parameters:
    ----------
    df : Dataframe with a 'processed_text' column of token lists.
    bigram : Phrase model applied to each token list via ``bigram[tokens]``.
    clf : Fitted classifier exposing ``predict``.

    Returns:
    -------
    Whatever ``clf.predict`` returns for the vectorized texts.

    Uses the module-level ``count_vectorizer`` to turn texts into features.
    """
    documents = []
    for tokens in df['processed_text']:
        documents.append(' '.join(bigram[tokens]))
    features = count_vectorizer.transform(documents)
    return clf.predict(features)

# Shared handles and lookup tables used by the helper functions in this notebook.
db = OracleHandler()
db_eng = create_engine(config.SQLALCHEMY_DATABASE_URI, encoding='utf8')
stops = set(stopwords.words('english'))  # English stop words
exclude = set(string.punctuation)  # punctuation characters to strip
lemma = WordNetLemmatizer()
columns = ['message', 'category_id', 'category']

# Category names listed in id order (position in the list == category id).
_category_names = [u'Business & CEOs',
                   u'Music',
                   u'Entertainment',
                   u'Fashion, Travel & Lifestyle',
                   u'Sports',
                   u'Tech',
                   u'Politics',
                   u'Science']
# Two-way lookup: id -> name and name -> id live in the same dict.
categories_map = dict(enumerate(_category_names))
categories_map.update({name: index for index, name in enumerate(_category_names)})

In [11]:
# create the training data
# NOTE: unpickling can execute arbitrary code -- only load a pycon_dict.pkl you trust.
# The context manager closes the file handle, which the bare open() never did.
with open("pycon_dict.pkl", "rb") as pickle_file:
    pycon_dict = pickle.load(pickle_file)
## or ###
def getTweets(category_dict, category):
    """
    Function to get the tweets for each handle in the dictionary in the particular category.

    Parameters:
    ----------
    category_dict: User category dictionary consisting of categories and user handles.
    category: String. Name of the category.

    Returns:
    -------
    category_dict: Dictionary with the most recent 200 tweets of all user handles.

    Relies on a module-level ``api`` object exposing ``GetUserTimeline`` -- it is
    not defined in this file (presumably a Twitter API client; TODO confirm).
    """
    for handle in category_dict[category]:
        category_dict[category][handle] = api.GetUserTimeline(screen_name=handle, count=200)
    # Return only after every handle has been fetched. The return used to sit
    # inside the loop, so only the first handle was ever populated (and an
    # empty category returned None).
    return category_dict


my_tags = pycon_dict.keys()
train, test = get_dataframes(pycon_dict)
train_texts = train['message'].apply(preprocess_text)
train_categories = train['category_id'].astype(int)
bigram = Phrases(train_texts)# For collocation detection
train_texts = [bigram[profile] for profile in train_texts]

#for the test data
test_texts = test['message'].apply(preprocess_text)
test_texts = [bigram[message] for message in test_texts]

count_vectorizer = CountVectorizer(max_features=5000)
train_count_features = count_vectorizer.fit_transform(' '.join(text) for text in train_texts)
# similar_word_correlation_matrix =  get_similar_word_correlation(train_texts)
# train_count_features = train_count_features * similar_word_correlation_matrix
clf_model = LogisticRegression()
clf_model = clf_model.fit(train_count_features, train_categories)
test_count_features = count_vectorizer.transform(' '.join(text) for text in test_texts)
predictions = clf_model.predict(test_count_features)
%matplotlib inline
# evaluate_prediction(predictions, test['category_id'])
# train_count_features.shape

In [12]:
# Fetch the stored timeline tweets (handle + text) of three specific accounts,
# limited to rows whose stored language is 'en' or 'und'.
sql = """
        select user_handle, text
        from tweets_timeline
        where lang in ('en', 'und') 
        --and user_handle = 'osayamenomigie'
        and user_handle in  ('osayamenomigie','Princeolaoluwa', 'Focusj3')
        """
df = pd.read_sql_query(sql, db_eng)
# Work on a copy so the raw query result in `df` stays untouched.
df2 = df.copy()
# Keep only rows that filter_lang classifies as English (its default lang is 'en').
df2 = df2[df2.apply(lambda row: filter_lang(row['text']), axis=1)] #filter for english
# df2 = df2.groupby(['user_handle'])['text'].apply(lambda x: ','.join(x)).reset_index()
# df2['text'] = df2['text'].map(clean_tweet)# remove urls
# Each tweet is preprocessed and classified on its own (one prediction per row),
# using the phrase model and classifier fitted in the training cell.
df2['processed_text'] = df2['text'].apply(preprocess_text)
final_predictions = predicit_categroies(df2, bigram, clf_model)

In [13]:
# Translate each predicted category id back to its name via categories_map.
df2['prediction'] = [categories_map[i] for i in final_predictions]
# df2.groupby(['user_handle', 'prediction']).agg(['count'])
# df2.groupby(['user_handle', 'prediction']).count()
# Widen the pandas display so the frame is printed without wrapping columns.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.width', 1000)
# Show the first 10 rows.
df2.head(10)

Unnamed: 0,user_handle,text,processed_text,prediction
0,Princeolaoluwa,"@_oribz @Healthertainmet just keep praying, pr...","[prayer, lot]",Entertainment
1,Princeolaoluwa,"@honorable creations, i give your fabric a tou...","[creation, fabric, touch, life, hotline]",Science
2,Princeolaoluwa,https://t.co/8OlpAK6DTv https://t.co/8OlpAK6DT...,[facebook],Tech
3,Princeolaoluwa,Myself and Love https://t.co/p1u75AAn2X,[],"Fashion, Travel & Lifestyle"
4,Princeolaoluwa,#ITS NOT TOO LATE# 9JA FOR LIFE,[life],Science
5,Princeolaoluwa,Check out Camera + - Selfies + in BlackBerry W...,"[check, camera, blackberry, world]",Tech
6,Princeolaoluwa,Visit http://t.co/GrCXzAwBq1 to register for a...,"[visit, register, security, card, today, treat...",Entertainment
7,Princeolaoluwa,Dunno what has gotten into me.. I'm loving it!!,[],"Fashion, Travel & Lifestyle"
10,Princeolaoluwa,C004B2F6B its just getting better!,[],"Fashion, Travel & Lifestyle"
11,Princeolaoluwa,Finally!!!! http://t.co/XYpzwGhCOU,[],"Fashion, Travel & Lifestyle"


In [7]:
# Count the tweets per (user_handle, predicted category) pair.
df2.groupby(['user_handle', 'prediction']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,processed_text
user_handle,prediction,Unnamed: 2_level_1,Unnamed: 3_level_1
Focusj3,Business & CEOs,5,5
Focusj3,Entertainment,3,3
Focusj3,"Fashion, Travel & Lifestyle",17,17
Focusj3,Music,6,6
Focusj3,Politics,1,1
Focusj3,Science,3,3
Focusj3,Sports,1,1
Focusj3,Tech,3,3
Princeolaoluwa,Business & CEOs,5,5
Princeolaoluwa,Entertainment,13,13


In [2]:
# most_influential_words(clf_model, count_vectorizer, category_index=2, num_words=200)

In [29]:
import sklearn
import pandas as p
import numpy as np
import scipy as sp
import pylab as pl
from sklearn import linear_model, cross_validation, metrics
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.metrics import zero_one_loss
from sklearn import preprocessing
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_selection import SelectKBest, chi2

modelType = "notext"

# ----------------------------------------------------------
# Prepare the Data
# ----------------------------------------------------------
training_data = np.array(p.read_table('F:/NYC/NYU/SM/3/SNLP/Project/Data/train.tsv'))
print ("Read Data\n")

# get the target variable and set it as Y so we can predict it
Y = training_data[:,-1]

print(Y)

# not all data is numerical, so we'll have to convert those fields
# fix "is_news": "?" becomes 0, every other value becomes 1
training_data[:,17] = [0 if x == "?" else 1 for x in training_data[:,17]]

# fix -1 entries in hasDomainLink
# Reads column 14 itself: the previous version read column 10 here and so
# overwrote column 14 with another column's values.
training_data[:,14] = [0 if x =="-1" else x for x in training_data[:,14]]

# fix "news_front_page": "?" -> 999, "1" -> 1, "0" -> 0, anything else unchanged
news_front_page_codes = {"?": 999, "1": 1, "0": 0}
training_data[:,20] = [news_front_page_codes.get(x, x) for x in training_data[:,20]]

# fix "alchemy category": each known name becomes its position in this list,
# "?" becomes 999, anything else is left unchanged
alchemy_categories = ["arts_entertainment",
                      "business",
                      "computer_internet",
                      "culture_politics",
                      "gaming",
                      "health",
                      "law_crime",
                      "recreation",
                      "religion",
                      "science_technology",
                      "sports",
                      "unknown",
                      "weather"]
alchemy_category_codes = {name: code for code, name in enumerate(alchemy_categories)}
alchemy_category_codes["?"] = 999
training_data[:,3] = [alchemy_category_codes.get(x, x) for x in training_data[:,3]]

print ("Corrected outliers data\n")

# ----------------------------------------------------------
# Models
# ----------------------------------------------------------
if modelType == "notext":
    print ("no text model\n")
    #ignore features which are useless
    X = training_data[:,[3, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 19, 20, 22, 25]]
    scaler = preprocessing.StandardScaler()
    print("initialized scaler \n")
    scaler.fit(X,Y)
    print("fitted train data and labels\n")
    X = scaler.transform(X)
    print("Transformed train data\n")
    svc = SVC(kernel = "linear")
    print("Initialized SVM\n")
    # Recursive feature elimination with 5-fold cross-validation around the linear SVM
    rfecv = RFECV(estimator = svc, cv = 5, loss_func = zero_one_loss, verbose = 1)
    print("Initialized RFECV\n")
    rfecv.fit(X,Y)
    print("Fitted train data and label\n")
    rfecv.support_
    print ("Optimal Number of features : %d" % rfecv.n_features_)
    # np.savetxt: the bare name `savetxt` is not imported anywhere in this cell
    np.savetxt('rfecv.csv', rfecv.ranking_, delimiter=',', fmt='%f')



IOError: File F:/NYC/NYU/SM/3/SNLP/Project/Data/train.tsv does not exist