In [1]:
import os
while  not 'juxtorpus' in os.listdir():
    os.chdir('../')
assert 'juxtorpus' in os.listdir(), f"Working directory should be at juxtorpus. But at {os.getcwd()}"
f"Working directory: {os.getcwd()}"

'Working directory: /Users/hcha9747/workspace/juxtorpus'

# Intro to Juxtorpus
Juxtorpus is a project that is designed for you to do corpus analysis work in an easy and seamless manner.


The main concept is to have a 'Corpus' unit that you can explore by yourself and pass it along to other corpus analysis tools that are being developed.
A major feature of the Corpus unit is the ability to slice them freely depending on your meta data.

Juxtorpus is one of those tools that you can pass a Corpus unit into to *juxtapose* two corpus.

### This Workshop:
1. Create a Corpus using the CorpusBuilder
2. Create a Corpus from a dataframe
3. Slicing the Corpus.
4. Referencing parent Corpus from subcorpus.
5. Adding the corpus into a Corpora.

### 1. Creating a Corpus using CorpusBuilder

In [2]:
from juxtorpus.corpus import Corpus, CorpusBuilder

path = './notebooks/demos/Sample_Auspol_Tweets.xlsx'
builder = CorpusBuilder(path)
builder.summary()   # Let's look at the summary of the builder

All Columns,created_at,from_user_name,id,lang,location,possibly_sensitive,retweet_count,source,text,tweet_type
Text,,,,,,,,,,
Meta,,,,,,,,,,
Dtype,,,,,,,,,,


In [3]:
builder.head()    # Let's do a preview of the data

Unnamed: 0,id,created_at,from_user_name,source,location,text,retweet_count,lang,possibly_sensitive,tweet_type
0,1403753011973956096,2021-06-12 16:36:13,mydirtyhotel,,"Toronto, CANADA",The latest My Dirty Hotel! https://t.co/9EpRbf...,0,en,0,Original
1,1498336572021772032,2022-02-28 16:37:31,sapienbloom,Twitter for Android,,RT @SaintFrankly: Good use of @ScottMorrisonMP...,0,en,0,Retweet
2,1481468339885342976,2022-01-13 03:29:11,shayne_chester,Twitter Web App,potts point,RT @cas_bar000: #novaxdjokovic #Asylum #HomeTo...,0,und,1,Retweet


In [4]:
# Now let's add all the meta data we want in our corpus.
builder.add_metas('created_at', dtypes='datetime')
builder.add_metas(['lang', 'source', 'tweet_type'], dtypes='category')
builder.add_metas(['from_user_name', 'retweet_count'], dtypes=['str', 'int'], lazy=False)
builder.add_metas('id')  # i.e. dtype is automatic
builder.add_metas('location', dtypes=None)  # i.e. dtype is automatic
builder.summary()

All Columns,created_at,from_user_name,id,lang,location,possibly_sensitive,retweet_count,source,text,tweet_type
Text,,,,,,,,,,
Meta,✅,✅,✅,✅,✅,,✅,✅,,✅
Dtype,datetime,str,inferred,category,inferred,,int,category,,category


In [5]:
builder.set_document_column('text') # set the document column
corpus = builder.build()

In [6]:
corpus.name   # a corpus name is randomly generated. You may also use builder.set_name() to define your own.

'vain-sphere'

In [7]:
corpus.summary()

Unnamed: 0,Unnamed: 1
Corpus Type,Corpus
Number of Documents,50000
Number of Total Words,970725
Size of Vocabulary,52248
mean Words per Document,19
min Words per Document,1
max Words per Document,68
metas,"created_at, lang, source, tweet_type, id, loca..."


In [8]:
corpus.meta  # this can take a bit longer as series are loaded lazily on default (otherwise set lazy=False in add_metas)

{'created_at': <SeriesMeta [Id: created_at dtype: datetime64[ns]]>,
 'lang': <SeriesMeta [Id: lang dtype: category]>,
 'source': <SeriesMeta [Id: source dtype: category]>,
 'tweet_type': <SeriesMeta [Id: tweet_type dtype: category]>,
 'id': <SeriesMeta [Id: id dtype: int64]>,
 'location': <SeriesMeta [Id: location dtype: object]>,
 'from_user_name': <SeriesMeta [Id: from_user_name dtype: object]>,
 'retweet_count': <SeriesMeta [Id: retweet_count dtype: int64]>}

### 2. Creating a Corpus from dataframe

In [9]:
import pandas as pd
corpus_df = Corpus.from_dataframe(pd.read_excel(path), col_doc='text')
corpus_df.summary()

Unnamed: 0,Unnamed: 1
Corpus Type,Corpus
Number of Documents,50000
Number of Total Words,970725
Size of Vocabulary,52248
mean Words per Document,19
min Words per Document,1
max Words per Document,68
metas,"id, created_at, from_user_name, source, locati..."


In [10]:
corpus.meta   # you'll have to define the datatypes when building the dataframe yourself.

{'created_at': <SeriesMeta [Id: created_at dtype: datetime64[ns]]>,
 'lang': <SeriesMeta [Id: lang dtype: category]>,
 'source': <SeriesMeta [Id: source dtype: category]>,
 'tweet_type': <SeriesMeta [Id: tweet_type dtype: category]>,
 'id': <SeriesMeta [Id: id dtype: int64]>,
 'location': <SeriesMeta [Id: location dtype: object]>,
 'from_user_name': <SeriesMeta [Id: from_user_name dtype: object]>,
 'retweet_count': <SeriesMeta [Id: retweet_count dtype: int64]>}

### 3. Slicing a Corpus

In [11]:
# Let's look at the unique values of the meta 'source'.
corpus.meta.get('source').series.unique()[:3]

[NaN, 'Twitter for Android', 'Twitter Web App']
Categories (122, object): [' rohingya Update', '101rafiki', 'Acast Podcasts', 'AdBlueNews', ..., 'twitter app marcel', 'world_news_eng', 'xael bot', 'Оwly']

In [12]:
android = corpus.slicer.filter_by_item('source', 'Twitter for Android')
android.summary()

Unnamed: 0,Unnamed: 1
Corpus Type,Corpus
Number of Documents,12734
Number of Total Words,244213
Size of Vocabulary,22327
mean Words per Document,19
min Words per Document,1
max Words per Document,60
metas,"created_at, lang, source, tweet_type, id, loca..."


In [13]:
# Let's look at the value counts of the meta 'retweet_count'
android.meta.get('retweet_count').series.value_counts().iloc[:5]

0    12356
1      130
2       62
3       40
4       21
Name: retweet_count, dtype: int64

In [14]:
android_at_least_10_retweets = android.slicer.filter_by_range('retweet_count', min_=10)
android_at_least_10_retweets.summary()

Unnamed: 0,Unnamed: 1
Corpus Type,Corpus
Number of Documents,94
Number of Total Words,1959
Size of Vocabulary,908
mean Words per Document,20
min Words per Document,5
max Words per Document,54
metas,"created_at, lang, source, tweet_type, id, loca..."


In [15]:
# do a groupby of the corpus on the datetime meta 'created_at'
groups = android_at_least_10_retweets.slicer.group_by('created_at', grouper=pd.Grouper(freq='1W'))
type(groups) # groups is a generator (follows pandas convention)

generator

In [16]:
groups = list(groups)
len(groups), groups[0]

(54,
 (Timestamp('2021-03-07 00:00:00', freq='W-SUN'),
  <juxtorpus.corpus.corpus.Corpus at 0x2ae8dba00>))

### 4. Referencing parent corpus

In [17]:
subcorpus = groups[0][1]
subcorpus.summary()

Unnamed: 0,Unnamed: 1
Corpus Type,Corpus
Number of Documents,1
Number of Total Words,14
Size of Vocabulary,14
mean Words per Document,14
min Words per Document,14
max Words per Document,14
metas,"created_at, lang, source, tweet_type, id, loca..."


In [18]:
subcorpus.parent.name

'bitter-lattice'

In [19]:
# subcorpus names are generated, you can rename them.
subcorpus.name = 'custom name'
subcorpus.name

'custom name'

In [21]:
# you can also find the root corpus from the subcorpus.
subcorpus.find_root().name, corpus.name

('vain-sphere', 'vain-sphere')

### 5. Let's put them in a Corpora

In [22]:
from juxtorpus.corpus import Corpora

corpora = Corpora([corpus])
for gid, subcorpus in groups:
    corpora.add(subcorpus)
    
corpora.render()

HTML(value='<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>Co…