In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the sample extract (point the path at the full dataset if desired),
# list the available columns, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='sample_data.csv')
print(df.columns)
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'sample_data.csv'

## NOTES FROM PARTNER MEETING 9/10/22
________________________________________________________________
#### use SpaCy to grab nouns/verbs, use their tagging functions to grab instances of people, geolocations, etc.
#### after getting this data, create features/attributes based on the data and join to the individual sources. Cluster
#### on datasets aggregated by sources and by country (and anything else we think of)

#### cluster word vectors (maps to dimensions and takes euclidean or other distances)
#### overall, try a lot of clustering

#### if we want to dive even deeper into NLP nuances, prof uploaded NLP video on onedrive (use second link not first)

#### characterizing the sources themselves by quality (or anything else) could be a good idea,
#### e.g. what's a good source for information x, who reports factually, who uses adjectives, etc.

In [None]:
# Keep only the source-related columns. An explicit copy is taken so that the
# column added below is written to an independent frame rather than to a
# slice of `df` (assigning into the slice raises SettingWithCopyWarning).
cols = ['source', 'source_scale', 'notes']
sources = df[cols].copy()

# The same combination of sources can be listed in different orders
# ('A; B' vs 'B; A'). Splitting on '; ' and sorting gives every combination a
# single canonical label, stored as the string form of the sorted list.
sources['source_cleaned'] = [str(sorted(s.split('; '))) for s in sources['source']]

sources.head(10)

In [None]:
# Two stacked panels showing how many entries fall under each 'source_scale'.
fig, ax = plt.subplots(2, figsize=(10, 10))
palette = 'viridis'

# Series.value_counts() replaces the deprecated top-level pd.value_counts().
# Its index is ordered from most to least frequent and drives the bar order.
scale_counts = sources['source_scale'].value_counts()

# Top panel: every source scale.
sns.countplot(ax=ax[0], palette=palette, y=sources['source_scale'], order=scale_counts.index)
# Bottom panel: the same column restricted to the 10 most frequent scales
# (positional slice, matching the .iloc usage in the later plotting cells).
sns.countplot(ax=ax[1], palette=palette, y=sources['source_scale'], order=scale_counts.iloc[:10].index)

#### It looks like the majority of entries are on a national scale.

In [None]:
# An entry in 'source' can name several sources separated by '; '. Flatten
# every entry into one list holding one element per individual mention, so
# that the number of times each source reported on an event can be counted.
sources_list = [
    single_source
    for joined_sources in sources['source']
    for single_source in joined_sources.split('; ')
]

In [None]:
# One row per individual source mention ...
source_expanded = pd.DataFrame({'source': sources_list})

# ... then tally the mentions per source, most frequent first.
(
    source_expanded
    .groupby('source')['source']
    .size()
    .reset_index(name='count')
    .sort_values(by='count', axis=0, ascending=False)
)

In [None]:
# Two stacked panels: source combinations (top) vs. individual sources (bottom).
fig, ax = plt.subplots(2, figsize=(10, 10))
palette = 'viridis'

# Series.value_counts() replaces the deprecated top-level pd.value_counts().
# Its index is ordered from most to least frequent and drives the bar order.
combination_counts = sources['source_cleaned'].value_counts()
individual_counts = source_expanded['source'].value_counts()

# Plot of sources: the 20 most frequent order-normalised combinations
sns.countplot(ax=ax[0], palette=palette, y=sources['source_cleaned'], order=combination_counts.iloc[:20].index)

# Plot of individually counted sources: the 10 most frequent single sources
sns.countplot(ax=ax[1], palette=palette, y=source_expanded['source'], order=individual_counts.iloc[:10].index)

### Yemen Data Project appears very frequently among the individually counted sources, but in the raw source combinations it only shows up paired with another source.

### Exploring these below:

In [None]:
# Rows whose source combination mentions 'Yemen Data Project'. regex=False
# makes the literal substring match explicit (the pattern is not a regex).
yemen = sources[sources['source_cleaned'].str.contains('Yemen Data Project', regex=False)]

# The 20 most frequent combinations that include it. Series.value_counts()
# replaces the deprecated top-level pd.value_counts().
yemen_counts = yemen['source_cleaned'].value_counts()
sns.countplot(y=yemen['source_cleaned'], order=yemen_counts.iloc[:20].index)

# NLP on 'notes' column

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

from wordcloud import WordCloud


# Tokenizer that keeps runs of word characters, which drops punctuation
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

# Concatenate every value of the 'notes' column into one big string, each
# note followed by a single space
text = ''.join(note + ' ' for note in sources['notes'])

tokens = tokenizer.tokenize(text)

# Lemmatize every token, then build the frequency distribution of the lemmas
lem_words = list(map(lemmatizer.lemmatize, tokens))
dist_lem = FreqDist(lem_words)

In [None]:
# Bar chart of the 15 most frequent lemmas (stopwords still included here)
top_common = dist_lem.most_common(15)
pdser = pd.Series({word: freq for word, freq in top_common})

fig, ax = plt.subplots(figsize=(12, 10))
all_plot = sns.barplot(x=pdser.index, y=pdser.values)

#### Looks like there were some stopwords. Removing those and trying again.

In [None]:
# Drop English stopwords (compared on the lower-cased token), then lemmatize
# and count again

stop_words = set(stopwords.words('english'))
tokens_cleaned = list(filter(lambda w: w.lower() not in stop_words, tokens))

lem_words_cleaned = list(map(lemmatizer.lemmatize, tokens_cleaned))
dist_lem_cleaned = FreqDist(lem_words_cleaned)

# Bar chart of the 15 most frequent remaining lemmas
top_common = dist_lem_cleaned.most_common(15)
pdser = pd.Series({word: freq for word, freq in top_common})

fig, ax = plt.subplots(figsize=(12, 10))
all_plot = sns.barplot(x=pdser.index, y=pdser.values)

In [None]:
# Word cloud built from the stopword-free lemma frequencies
wc = WordCloud()
wc.generate_from_frequencies(dist_lem_cleaned)
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Counter was used here without ever being imported, so the cell raised a
# NameError on a fresh kernel.
from collections import Counter

# NOTE(review): `nlp` is not defined in any cell visible in this notebook. It
# is presumably a loaded spaCy pipeline (e.g. the result of spacy.load(...)).
# Confirm, and define it in an earlier cell so Restart & Run All succeeds.

# applying nlp() wrapper to the concatenated notes to obtain SpaCy attributes
mytext = nlp(text)

# counting named entities: one label per detected entity, tallied per label
labels = [ent.label_ for ent in mytext.ents]
counts = Counter(labels)
counts

# Creating DataFrame of joined individual sources

In [None]:
# First, we need a list of distinct individual sources
sources_distinct = list(set(sources_list))
sources_distinct_df = pd.DataFrame({'source_singular': sources_distinct})

# Second, join this df with the full df with the help of pandasql
from pandasql import sqldf


def sql(q):
    """Run SQL query `q` through pandasql, resolving table names against the notebook globals."""
    return sqldf(q, globals())


# Join every row of `df` to each individual source listed in its 'source'
# field. The match is anchored on the '; ' delimiter: both sides are wrapped
# in '; ' and compared with instr(), so a source only matches a whole entry.
# The previous `LIKE '%name%'` join was a plain substring match, which
#   * attached a source to rows that merely contained its name inside a
#     longer source name,
#   * treated '%' and '_' inside source names as wildcards, and
#   * required stripping every '%' afterwards, damaging names that
#     legitimately contain one.
# instr() does a literal search, so no wildcard wrapping or cleanup is needed
# and `sources_distinct_df` keeps the unmodified source names.
expanded_source_df = sql('''
    SELECT * FROM df s
    JOIN sources_distinct_df sd
    ON instr('; ' || s.source || '; ', '; ' || sd.source_singular || '; ') > 0
''')

expanded_source_df.head()

#### Now, we can analyze singular sources based on other columns of interest.