Connect to the shared Google Drive

In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
# google.colab is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import the necessary libraries and packages

In [None]:
import pandas as pd
import html
import re
!pip install geopy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import numpy as np
import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm # Use a linear svm model
from sklearn.metrics import classification_report
# Use gridsearch to hypertune parameters for linear svm
from sklearn.model_selection import GridSearchCV
import pickle
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
tk = SyllableTokenizer()
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
tqdm.pandas()


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Read the raw data scraped from Twitter

In [None]:
# Read the raw tweet CSVs (one file per search term) from the shared drive.

TWEET_DATA_DIR = '/content/drive/Shareddrives/Big data/data_v2'


def _read_tweets(name, **read_csv_kwargs):
  """Load ``<TWEET_DATA_DIR>/<name>.csv`` into a DataFrame.

  ``low_memory=False`` makes pandas infer each column's dtype from the whole
  file instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
  Any extra keyword arguments are forwarded to ``pd.read_csv``.
  """
  return pd.read_csv(f'{TWEET_DATA_DIR}/{name}.csv', low_memory=False, **read_csv_kwargs)


barbera = _read_tweets('Barbera')
cabernet = _read_tweets('Cabernet')
chardonnay = _read_tweets('Chardonnay')

# "Ciu Ciu" was scraped under four spellings; the parts are combined later.
ciu_ciu_1 = _read_tweets('Ciu Ciu')
ciu_ciu_2 = _read_tweets('Ciù Ciù')
ciu_ciu_3 = _read_tweets('CiuCiu')
ciu_ciu_4 = _read_tweets('CiùCiù')

# These files are read with an explicit '\n' line terminator, exactly as before.
# NOTE(review): presumably needed because of stray '\r' characters in tweet text - confirm.
merlot = _read_tweets('Merlot', lineterminator='\n')
montepulciano = _read_tweets('Montepulciano', lineterminator='\n')
moscato = _read_tweets('Moscato', lineterminator='\n')
pinot_grigio = _read_tweets('Pinot Grigio', lineterminator='\n')
sangiovese = _read_tweets('Sangiovese', lineterminator='\n')
sauvignon = _read_tweets('Sauvignon', lineterminator='\n')
syrah = _read_tweets('Syrah', lineterminator='\n')


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Combine the 4 separate data files for Ciu Ciu into one

In [None]:
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every part keeps its own 0-based labels, so labels repeat and any later
# label-based access (df.drop(index), df[col][i]) hits several rows at once.
ciu_ciu = pd.concat([ciu_ciu_1, ciu_ciu_2, ciu_ciu_3, ciu_ciu_4], ignore_index=True)

Remove unwanted columns from each data set

In [None]:
# Remove the engagement-count columns from every data set (in place, so the
# existing variable names keep pointing at the trimmed frames).

cols = ['replyCount', 'retweetCount', 'likeCount', 'quoteCount']
df = [barbera, cabernet, chardonnay, ciu_ciu, merlot, montepulciano, moscato, pinot_grigio, sangiovese, sauvignon, syrah ]
for frame in df:
  # errors='ignore' keeps the cell re-runnable: a second run would otherwise
  # raise KeyError because the columns are already gone.
  frame.drop(columns=cols, inplace=True, errors='ignore')


Define the cleaning function for the location data from the Twitter data sets

In [None]:
 # cleaning location data

def location_cleaner(df, sample_divisor=1000, random_state=3):
  """Filter tweets by usable ``user_location`` and geocode a small sample.

  Steps:
    1. Drop rows whose ``user_location`` is missing or contains a ``#``.
    2. Run spaCy NER on the location text and keep only rows where at least
       one entity is found; the entities are stored in ``extracted_user_loc``.
    3. Draw ``len(df) // sample_divisor + 2`` rows at random (capped at the
       number of rows available) and geocode their ``user_location`` with
       Nominatim, rate-limited to one request every 3 seconds.
    4. Drop sampled rows that could not be geocoded.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``user_location`` column. The input is not modified.
  sample_divisor : int, default 1000
      One row per ``sample_divisor`` rows is sampled (plus 2).
  random_state : int, default 3
      Seed passed to ``DataFrame.sample`` so the sample is reproducible.

  Returns
  -------
  pandas.DataFrame
      The sampled rows with the extra columns ``extracted_user_loc`` and
      ``location`` (the geocoder result).
  """
  # Boolean masks are used instead of df.drop(<index labels>) so that rows are
  # removed by position; dropping by label would also remove unrelated rows
  # whenever the index contains duplicate labels.
  location = df['user_location']
  has_hashtag = location.str.contains('#', na=False)
  # .copy() so the column assignment below does not write into a slice of the caller's frame
  df = df[~has_hashtag & location.notna()].copy()

  def _entities_or_nan(text):
    # Run the spaCy pipeline once per row (the original ran it twice).
    entities = list(nlp(text).ents)
    return entities if entities else np.nan

  # keep only locations in which spaCy recognises at least one named entity
  df['extracted_user_loc'] = df['user_location'].astype(str).apply(_entities_or_nan)
  df = df[df['extracted_user_loc'].notna()]

  # sample roughly 1/sample_divisor of the rows for geolocation; cap at len(df)
  # because DataFrame.sample raises ValueError when n exceeds the row count
  number_samples = min(int(len(df) / sample_divisor) + 2, len(df))
  if number_samples > 0:
    df_sample = df.sample(n=number_samples, random_state=random_state).copy()
  else:
    df_sample = df.copy()

  # geocode the sampled data
  geolocator = Nominatim(user_agent="my-application")
  geocode = RateLimiter(geolocator.geocode, min_delay_seconds=3,max_retries=3)
  df_sample['location'] = df_sample['user_location'].progress_apply(geocode)

  # remove any rows for which the geocoder returned nothing
  return df_sample[df_sample['location'].notna()]


Clean & preprocess the location data & save each cleaned location file as a .csv to Google Drive

In [None]:
# Geocode a sample of the Syrah tweets, write the result to a CSV in the
# current working directory, then copy that file into Drive.
# NOTE(review): the copy target is the relative path "drive/My Drive/", not the
# shared-drive data_v2 folder used elsewhere in this notebook - confirm intended.
syrah1 = location_cleaner(syrah)
syrah1.to_csv('syrah_location.csv')
!cp syrah_location.csv "drive/My Drive/"

100%|██████████| 45/45 [02:27<00:00,  3.28s/it]


Define the preprocessing function for the text data from Twitter

In [None]:
# preprocess the text data

def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# Noise stripped from each tweet: "<br /><br />" markup, @mentions, #hashtags,
# any character that is neither a word character nor whitespace, and URLs.
# The digit range is written with an ASCII hyphen ("0-9"); the previous pattern
# contained an en dash, which made the class match only the literal characters
# "0", "–" and "9", so "@user123" was only partially removed.
_TWEET_NOISE_RE = re.compile(r'<br /><br />|(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)|[^\w\s]|http\S+')


def preprocessor(df, syllabify=False):
  """Clean the ``text`` column and add a ``tweets_to_token`` column.

  ``text`` is cast to ``str``, newlines become spaces, HTML entities are
  unescaped and mentions / hashtags / punctuation / URLs are replaced by a
  space. ``tweets_to_token`` holds the word-tokenized text with English
  stopwords (except "not") removed, joined back into one space-separated
  string so it can be fed to ``CountVectorizer``.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``text`` column. As before, the frame is modified in
      place (``text`` is overwritten, ``tweets_to_token`` is added) and the
      same object is returned.
  syllabify : bool, default False
      When True, every word token is additionally split into syllables with
      the module-level ``SyllableTokenizer`` before stopword filtering.
      The original code contained this step but, because the loop was
      dedented, it ran only for the last row, overwrote the row once per
      token and discarded the flattened result. It is therefore off by
      default so every row gets the same word-level treatment that all rows
      but the last already received.
      NOTE(review): with syllabify=True the stopword filter is applied to
      syllables (as the original statement order implies) - confirm that is
      the desired order before enabling it.

  Returns
  -------
  pandas.DataFrame
      ``df`` itself, with the cleaned ``text`` and the new ``tweets_to_token``.
  """
  sw = set(stopwords.words('english')) #you can adjust the language as you desire
  # "not" is kept because removing it would flip the meaning of a tweet
  sw.discard('not')
  # NOTE(review): the comparison is case-sensitive (unchanged), so capitalised
  # stopwords such as "The" are kept - confirm whether that is intended.

  def _clean(text):
    # newline -> space, then decode entities such as "&amp;", then strip noise
    text = html.unescape(text.replace('\n', ' '))
    return _TWEET_NOISE_RE.sub(' ', text)

  def _to_token_string(text):
    tokens = word_tokenize(text)
    if syllabify:
      tokens = flatten(tk.tokenize(token) for token in tokens)
    return ' '.join(token for token in tokens if token not in sw)

  # Whole-column assignment replaces the chained df['text'][i] = ... writes,
  # which triggered SettingWithCopyWarning and only worked on a 0..n-1 index.
  df["text"] = df["text"].astype(str).map(_clean)
  df['tweets_to_token'] = df['text'].map(_to_token_string)

  return df

Clean & preprocess the text data & save each cleaned data set as a .csv file to the shared Google Drive

In [None]:
# Clean and tokenize the Syrah tweets, write the result to a CSV in the current
# working directory, then copy that file into the shared-drive data_v2 folder.
syrah_token = preprocessor(syrah)
syrah_token.to_csv('syrah_token.csv')
!cp syrah_token.csv "drive/Shareddrives/Big data/data_v2/" 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
A value is trying to be set on 