In [11]:
import pprint
import pathlib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans




In [30]:
def read_csv_data(df):
    """
    Reads two csv files (tweets and events) and 
    returns a pandas dataframe for each file
    """
    return pd.read_csv(df, header=0)

In [31]:
syria_data_dir = pathlib.Path('/Users/adamstueckrath/Desktop/syria_data/')
results_csv = syria_data_dir / 'model' / 'model_data' / 'tweet_modelvtest.csv'
results_df = read_csv_data(results_csv)
print(results_df.dtypes)
print(results_df.shape)

results_df = results_df.dropna(subset=['cosine_value'])
print(results_df.shape)

results_df = results_df[(results_df[['cosine_value']] != 0).all(axis=1)]
print(results_df.shape)

  if (yield from self.run_code(code, result)):


tweet_id                     object
tweet_created_at             object
tweet_text                   object
tweet_sentiment_compound    float64
tweet_sentiment_score       float64
tweet_sentiment_label        object
tweet_text_normalize         object
event_id                    float64
cosine_value                float64
event_date                   object
location                     object
event_text                   object
event_text_normalize         object
dtype: object
(638032, 13)
(106720, 13)
(68277, 13)


In [32]:
def remove_tweets_filter(df, filter_amount=None):
    """
    Drops Nan values from the tweet_text_normalize column and 
    returns dataframe filtered or whole
    """
    df = df.dropna(subset=['tweet_text_normalize'])
    if filter_amount:
        df = df[:filter_amount]

    return df

def eval_dataframe(df):
    """
    Evaluates dataframe column saved as string format 
    into a python list object
    """
    df['tweet_text_normalize'] = df['tweet_text_normalize'].apply(eval)
    df['event_text_normalize'] = df['event_text_normalize'].apply(eval)
    return df

def create_corpus(df):
    """
    Creates two corpus' for the algorithm. The tweet corpus
    will be the training set and the event corpus will be the query.
    """
    # column to list
    tweet_normalize_list = df.tweet_text_normalize.tolist()
    event_normalize_list = df.event_text_normalize.tolist()
    # join processed text together for corpus'
    tweet_corpus = [ ' '.join(x) for x in tweet_normalize_list ]
    event_corpus = [ ' '.join(x) for x in event_normalize_list ]

    return tweet_corpus, event_corpus


In [33]:
results_df = remove_tweets_filter(results_df)
results_df = eval_dataframe(results_df)
tweet_corp, event_corp = create_corpus(results_df)

print(results_df.shape)
print(len(tweet_corp), len(event_corp))

(68277, 13)
68277 68277


In [41]:
def tfidf_algo(tweet_corpus, print_shape=False):
    """
    returns a two document term matrices of document (y) and featured words (x)

    fit_transform(): Learn vocabulary and idf, return term-document matrix.
    transform(): Transform documents to document-term matrix. Uses the vocabulary and 
    document frequencies (df) learned by fit_transform()
    """
    vectorizer = TfidfVectorizer(min_df=.000025, max_features=10000)
    tweetVectorizerArray = vectorizer.fit_transform(tweet_corpus)
    print(vectorizer.get_feature_names())
    if print_shape:
        print('Tweets: {} Events: {}'.format(tweetVectorizerArray.shape, eventVectorizerArray.shape))
    return tweetVectorizerArray


In [42]:
tweetVectorizerArray= tfidf_algo(tweet_corp)
print(tweetVectorizerArray.get_feature_names())
# tweet_corp, event_corp
# silhouette_score(tweetVectorizerArray.toarray(), eventVectorizerArray.toarray(), sample_size=50)



AttributeError: get_feature_names not found

In [23]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(eventVectorizerArray)
data2D = pca.transform(eventVectorizerArray)  

In [25]:
for num_clusters in range(2,3):
    km = KMeans(n_clusters=num_clusters,
            n_init=10,                        # number of iterations with different seeds
            random_state=1                    # fixes the seed 
           )
    cluster_labels = km.fit_predict(data2D)
    print(cluster_labels)
    silhouette_avg = silhouette_score(data2D, cluster_labels)
    print(silhouette_avg)

[1 1 1 ... 0 1 1]


KeyboardInterrupt: 

In [None]:
for n_clusters in range(2, 10):
    km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1)
    cluster_labels = km.fit_predict(eventVectorizerArray)
    silhouette_avg = silhouette_score(eventVectorizerArray, cluster_labels, sample_size=1000)

    print(n_clusters, silhouette_avg)