In [1]:
# Set the environment
import pandas as pd
import numpy as np
import nltk


In [27]:
# Load the data
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists — consider a configurable data dir.
df = pd.read_csv(r'/home/brian/Documents/tweetf/tweets.csv')

# Make the columns viewable: lift the column-width limit so long text cells
# are displayed in full instead of being truncated
pd.set_option('display.max_colwidth', None)

# Preview the first rows of the loaded frame
df.head()


Unnamed: 0,text,label
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1


In [None]:
# Count the number of rows per label to check the class balance
df['label'].value_counts()


In [None]:
# Take first text entry
sample = df.text[0]
sample


In [None]:
# Import a tokenizer
from nltk import word_tokenize

# Tokenize the sample
sample_tokens = word_tokenize(sample)

# Return the tokens
sample_tokens


In [None]:
# Import the package
from nltk import bigrams

# Create the bigrams
sample_bitokens = list(bigrams(sample_tokens))

# Return the bigrams
sample_bitokens


In [None]:
# Check the frequency of occurrence of tokens
from nltk import FreqDist

# Create a frequency distro for the tokens
sample_distro = FreqDist(sample_tokens)

# Return the top 10
sample_distro.most_common(10)


In [8]:
# Create a function to do all this
def token_analyze(text, n):
    """Return the ``n`` most frequent tokens of ``text``.

    The text is split with ``word_tokenize`` and the tokens are counted with
    ``FreqDist``. The result is whatever ``FreqDist.most_common(n)`` yields:
    a list of ``(token, count)`` pairs, most frequent first.
    """
    # Tokenize and count in one step, then keep only the n largest counts
    token_counts = FreqDist(word_tokenize(text))
    return token_counts.most_common(n)


token_analyze(df.text[9], 5)


[('the', 2), ('Loved', 1), ('casting', 1), ('of', 1), ('Jimmy', 1)]

In [9]:
from sklearn.feature_extraction.text import CountVectorizer


def create_dtm(series):
    """Build a document-term matrix (DTM) as a DataFrame.

    Parameters
    ----------
    series : iterable of str
        The documents to vectorize (e.g. a pandas Series of text).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term learned by
        ``CountVectorizer``; each cell is that term's count in the document.
        Rows are numbered positionally starting at 0.
    """
    # Learn the vocabulary and count term occurrences (sparse result)
    cv = CountVectorizer()
    sparse_counts = cv.fit_transform(series)

    # Densify to a plain ndarray. `toarray()` is used instead of `todense()`
    # because the latter returns a `numpy.matrix`, which NumPy discourages.
    # The dense array holds documents x vocabulary cells, so keep inputs small.
    dense_counts = sparse_counts.toarray()

    # Label the columns with the vocabulary terms
    return pd.DataFrame(dense_counts, columns=cv.get_feature_names_out())


# Try it out
create_dtm(df.text.head())


Unnamed: 0,about,acting,aimless,almost,and,angles,anything,artiness,as,attempting,...,trying,very,walked,was,when,white,who,whom,with,young
0,1,0,1,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
2,0,1,0,1,3,1,0,1,1,1,...,0,0,0,1,0,1,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0


In [13]:
# Import logistic regression
from sklearn.linear_model import LogisticRegression


def top_tokens(text, sentiment, n):
    """Return the ``n`` tokens with the largest logistic-regression coefficients.

    A ``CountVectorizer`` document-term matrix is built from ``text`` and a
    ``LogisticRegression`` model is fitted against ``sentiment``. The first
    row of the fitted coefficients is paired with the vocabulary terms.

    Returns a DataFrame with the columns 'Tokens' and 'Coefficients',
    limited to the ``n`` rows with the highest coefficient.
    """
    # Turn the raw documents into token counts
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(text)

    # Fit the classifier on the counts (fixed seed keeps the fit repeatable)
    classifier = LogisticRegression(solver='lbfgs', max_iter=2500, random_state=1234)
    classifier.fit(term_counts, sentiment)

    # One row per vocabulary term with its coefficient
    coef_table = pd.DataFrame({
        'Tokens': vectorizer.get_feature_names_out(),
        'Coefficients': classifier.coef_[0],
    })

    return coef_table.nlargest(n, 'Coefficients')


# Test it on the df['text']
top_tokens(df.text, df.label, 10)


Unnamed: 0,Tokens,Coefficients
1567,liked,1.286747
2997,wonderful,1.242158
1104,funny,1.112821
1182,great,1.068772
2949,well,1.043139
246,beautiful,1.042833
0,10,1.035405
344,brilliant,1.01408
908,excellent,1.009914
2203,right,0.985806


In [None]:
# -------------TEXTBLOB----------------
# Here no labels are needed: TextBlob determines the sentiment itself
from textblob import TextBlob


def polarity_subjectivity(text, print_results=False):
    """Score ``text`` with TextBlob and report polarity and subjectivity.

    With ``print_results`` false (the default) the two scores are returned
    as a ``(polarity, subjectivity)`` tuple. With ``print_results`` true the
    scores are printed rounded to two decimals and nothing is returned.
    """
    # The first sentiment entry is the polarity, the second the subjectivity
    sentiment = TextBlob(text).sentiment
    polarity, subjectivity = sentiment[0], sentiment[1]

    if not print_results:
        return (polarity, subjectivity)

    print("Polarity is: ", round(polarity, 2),
          "and Subjectivity is: ", round(subjectivity, 2))


# Test
polarity_subjectivity(sample, True)


Polarity is:  0.18 and Subjectivity is:  0.4


In [None]:
# ------------COUNTING THE NUMBER OF TOKENS------------------

from nltk import word_tokenize

# A function that counts the number of tokens in a string


def token_count(str):
    """Return how many tokens ``word_tokenize`` finds in the string ``str``."""
    # NOTE: the parameter name shadows the built-in `str` inside this function
    tokens = word_tokenize(str)
    return len(tokens)

# A function that counts tokens in a Pandas Series


def stoken_count(series):
    """Return the token count of every entry in ``series``.

    ``token_count`` is applied entry by entry; the result is what
    ``series.apply`` returns for it.
    """
    counts_per_entry = series.apply(token_count)
    return counts_per_entry


# test
stoken_count(df.text.head(10))


0    18
1    21
2    33
3     9
4    22
5    27
6     4
7    17
8     4
9    11
Name: text, dtype: int64

In [None]:
# A function that applies polarity_subjectivity to every entry of a column
def series_pol_subj(series):
    """Return the sentiment scores of every entry in ``series``.

    ``polarity_subjectivity`` is applied entry by entry with its default
    arguments, so each result is a ``(polarity, subjectivity)`` tuple.
    """
    scores_per_entry = series.apply(polarity_subjectivity)
    return scores_per_entry


# Apply to the top 10 rows of our dataframe
series_pol_subj(df['text'].head(10))


0                                 (0.18, 0.395)
1    (0.014583333333333337, 0.4201388888888889)
2    (-0.12291666666666666, 0.5145833333333333)
3                  (-0.24375000000000002, 0.65)
4                                    (1.0, 0.3)
5                                   (-0.1, 0.5)
6                                   (-0.2, 0.0)
7                     (0.7, 0.6000000000000001)
8                                   (-0.2, 0.5)
9                                    (0.7, 0.8)
Name: text, dtype: object

In [None]:
# -------------MEASURE OF COMPLEXITY-LEXICAL DIVERSITY----------------

def complexity(str):
    """Return the lexical diversity of the string ``str``.

    Lexical diversity is the number of distinct tokens divided by the total
    number of tokens, using ``word_tokenize`` to split the text. Tokens are
    compared as-is, so the comparison is case-sensitive.

    Returns
    -------
    float
        The ratio of unique to total tokens, or ``0.0`` when the text
        contains no tokens at all (e.g. an empty string), instead of
        raising ``ZeroDivisionError``.
    """
    # Tokenize once and reuse the list for both counts
    all_tokens = word_tokenize(str)

    # No tokens means the ratio is undefined; report 0.0 rather than crash
    if not all_tokens:
        return 0.0

    return len(set(all_tokens)) / len(all_tokens)


# test
df.text.head(10).apply(complexity)


0    0.722222
1    0.952381
2    0.848485
3    1.000000
4    1.000000
5    0.814815
6    1.000000
7    0.941176
8    1.000000
9    0.909091
Name: text, dtype: float64

In [11]:
# -----------------TEXT CLEANUP----------------

from nltk.corpus import stopwords


def stopword_remover(str):
    """Tokenize the string ``str`` and drop English stop words.

    Tokens come from ``word_tokenize``; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. Remaining tokens
    keep their original casing and order. Punctuation tokens are not
    stop words and are therefore kept.

    Returns
    -------
    list of str
        The tokens of ``str`` that are not stop words.
    """
    # Create tokens
    tokens = word_tokenize(str)

    # Use a set so each membership test is constant-time instead of a scan
    # over the whole stop-word list for every token
    eng_stopwords = set(stopwords.words('english'))

    # Return non-stopwords
    return [w for w in tokens if w.lower() not in eng_stopwords]


# test
df.text.head(5).apply(stopword_remover)


0                                                                                                 [,, ,, slow-moving, ,, aimless, movie, distressed, ,, drifting, young, man, .]
1                                                                                                        [sure, lost, -, flat, characters, audience, ,, nearly, half, walked, .]
2    [Attempting, artiness, black, &, white, clever, camera, angles, ,, movie, disappointed, -, became, even, ridiculous, -, acting, poor, plot, lines, almost, non-existent, .]
3                                                                                                                                            [little, music, anything, speak, .]
4                                                                                                     [best, scene, movie, Gerardo, trying, find, song, keeps, running, head, .]
Name: text, dtype: object

In [14]:
# ----------REMOVING PUNCTUATION MARKS-----------------

def nonalpha_remover(str):
    """Return the non-stop-word tokens of ``str`` that are purely alphabetic.

    The text is first passed through ``stopword_remover``. Every remaining
    token for which ``isalpha()`` is false is dropped — punctuation, but
    also hyphenated words such as "slow-moving".
    """
    alphabetic_tokens = []
    for token in stopword_remover(str):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens


df['text'].head().apply(nonalpha_remover)


0                                                                                                [aimless, movie, distressed, drifting, young, man]
1                                                                                    [sure, lost, flat, characters, audience, nearly, half, walked]
2    [Attempting, artiness, black, white, clever, camera, angles, movie, disappointed, became, even, ridiculous, acting, poor, plot, lines, almost]
3                                                                                                                  [little, music, anything, speak]
4                                                                           [best, scene, movie, Gerardo, trying, find, song, keeps, running, head]
Name: text, dtype: object

In [26]:
# -----------CLEANING COMPLEXITY------------
# removing punctuation and stopwords

def complexity_cleaned(series):
    """Return the lexical diversity of each entry after cleaning.

    Each text is reduced to its alphabetic, non-stop-word tokens with
    ``nonalpha_remover``. The tokens are re-joined with single spaces and
    scored with ``complexity``.

    Entries that have no tokens left after cleaning (for example a text made
    up only of stop words and punctuation) score ``0.0`` instead of causing
    a division by zero.
    """
    def _cleaned_score(text):
        # Score one text; short-circuit when cleaning removed everything
        cleaned_tokens = nonalpha_remover(text)
        if not cleaned_tokens:
            return 0.0
        return complexity(' '.join(cleaned_tokens))

    return series.apply(_cleaned_score)


# Add 'complexity' column to the dataframe
# NOTE: `complexity_cleaned` calls `complexity` and `nonalpha_remover`, so the
# cells defining those functions must have been run in this kernel first;
# otherwise this line fails with a NameError.
df['complexity'] = complexity_cleaned(df.text)

# Return top 10 highest complexity scores
df.sort_values(['complexity'], ascending=False).head(10)


NameError: name 'complexity' is not defined