# Connection, install, import

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# files stored there can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd drive/MyDrive/'...Your path...'/

In [None]:
pip install wordninja

In [None]:
# used packages
import csv
import pandas as pd
import re
import pickle
import wordninja
from tqdm import tqdm
# NOTE(review): this is the standard-library source-code tokenizer; it is not
# referenced in the visible part of the notebook.
from tokenize import tokenize

from nltk.tokenize import word_tokenize
# This import is also what binds the top-level name `nltk` used below.
import nltk.classify.util
# Fetch the NLTK 'punkt' resource (presumably required by the tokenizer load
# at the end of this cell, so keep it before that line).
nltk.download('punkt')
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' resource.
nltk.download('stopwords')
# Load the English Punkt tokenizer from the NLTK data directory.
# NOTE(review): `tokenizer` is not referenced in the visible part of the notebook.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# preprocessing functions

In [None]:
# Tokens that are removed from every tweet during pre-processing.
stop_words = ['but', 'only', 'retweet']

# Contraction lookup table (contraction -> expansion), unpickled from the
# project's dictionary folder.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
CONTRACTION_MAP = {}  # keeps the name defined even if loading below fails
with open('dictionary/Contraction_Dict.p', mode='rb') as contraction_file:
    CONTRACTION_MAP = pickle.load(contraction_file)

def convert_contraction_to_word(text):
    """Expand contractions found inside ``text``.

    Each key of the module-level ``CONTRACTION_MAP`` is inserted, unescaped,
    into a regular expression that requires a single space before the key and
    one non-word character after it. Every match (including that leading
    space and the trailing character) is replaced by the mapped expansion,
    with its internal whitespace collapsed, surrounded by single spaces.

    Because of those boundary requirements, a contraction at the very start
    or the very end of ``text`` is left untouched by this function.

    Args:
        text: The string to process.

    Returns:
        ``text`` with the matched contractions expanded.
    """
    for contraction, expansion in CONTRACTION_MAP.items():
        replacement = f' {" ".join(expansion.split())} '
        # The non-word class is written as a raw string: in a plain string
        # literal it is an invalid escape sequence that newer Python
        # versions warn about.
        text = re.sub(' ' + contraction + r'\W', replacement, text)
    return text

# Emoji lookup table (emoji pattern -> textual description), unpickled from
# the project's dictionary folder.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
emojis = {}  # keeps the name defined even if loading below fails
with open('dictionary/Emoji.pickle', mode='rb') as emoji_file:
    emojis = pickle.load(emoji_file)

def convert_smiley_to_word(text):
    """Replace every emoji listed in ``emojis`` with its description.

    Each key of the module-level ``emojis`` mapping is wrapped in a capturing
    group and used as a regular expression; each match is replaced by a space
    followed by the mapped description with its whitespace collapsed to
    single spaces.

    Args:
        text: The string to process.

    Returns:
        ``text`` with the matched emojis spelled out.
    """
    for pattern, description in emojis.items():
        spelled_out = " ".join(description.split())
        text = re.sub(f'({pattern})', f' {spelled_out}', text)
    return text

# Emoticon lookup table (emoticon pattern -> textual description), unpickled
# from the project's dictionary folder.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
emoticons = {}  # keeps the name defined even if loading below fails
with open('dictionary/emoticons.pickle', mode='rb') as emoticon_file:
    emoticons = pickle.load(emoticon_file)

def convert_emoticons_to_word(text):
    """Replace every emoticon listed in ``emoticons`` with its description.

    Each key of the module-level ``emoticons`` mapping is wrapped in a
    capturing group and used as a regular expression; each match is replaced
    by a space followed by the mapped description with its whitespace
    collapsed to single spaces.

    Args:
        text: The string to process.

    Returns:
        ``text`` with the matched emoticons spelled out.
    """
    for pattern, description in emoticons.items():
        spelled_out = " ".join(description.split())
        text = re.sub(f'({pattern})', f' {spelled_out}', text)
    return text

# preprocessing tweets

In [None]:
# Candidate values for company_name (presumably one CSV per company):
# adobe apple ibm microsoft nvidia qualcomm salesforce servicenow
company_name = 'adobe'
path = f'tweets/{company_name}.csv'

# Read the downloaded tweets, order them by date, keep one row per
# (text, username) pair and renumber the rows from 0.
df = (
    pd.read_csv(path, lineterminator='\n')
    .sort_values(by='date')
    .drop_duplicates(subset=['text', 'username'])
    .reset_index(drop=True)
)
df['text_format'] = ''  # placeholder column for the cleaned text
df

In [None]:
# Display the rows whose 'text' value is missing.
# To discard them instead, run:
# df = df[df['text'].isnull().values == False]
df[df['text'].isnull()]

In [None]:
# Display the rows whose 'date' value is missing.
# To discard them instead, run:
# df = df[df['date'].isnull().values == False]
df[df['date'].isnull()]

In [None]:
# Regular expressions used by preprocess_text, compiled once up front.
# The optional ';' removes the whole HTML entity; matching only '&amp' / '&gt'
# would leave a stray ';' behind in the text.
_NOISE_RE = re.compile(r"&amp;?|&gt;?|https?://\S+|@\S+")  # entities, URLs, @mentions
_ELLIPSIS_RE = re.compile(r"\.{2,}")  # two or more consecutive dots
_HASHTAG_RE = re.compile(r"#\S+")  # '#' followed by non-space characters
_DISALLOWED_RE = re.compile(r"[^a-zA-Z0-9();:\".%?!/,'\- ]")  # all but the listed characters


def _split_hashtag(match):
    """Return the matched hashtag broken into words by wordninja, space-joined."""
    return " ".join(wordninja.split(match.group()))


def preprocess_text(text):
    """Clean one raw tweet and return the normalised string.

    Steps, in order: lower-case; remove ``&amp``/``&gt`` entities, http(s)
    URLs and @mentions; turn runs of dots into ". "; expand contractions;
    decode emoticons and emojis; split hashtags into words; drop the tokens
    listed in ``stop_words``; delete every character outside the allowed set.

    Args:
        text: Value of the 'text' column for one row. Missing values
            (NaN/None) are accepted.

    Returns:
        The cleaned text, or an empty string for a missing value.
    """
    if pd.isna(text):
        # A missing tweet cannot be lower-cased; return an empty result so
        # the row is handled like any other tweet without usable words.
        return ''
    text = str(text).lower()
    text = _NOISE_RE.sub("", text)
    text = _ELLIPSIS_RE.sub(". ", text)
    text = convert_contraction_to_word(text)
    # Token-wise lookup as a second pass: convert_contraction_to_word only
    # matches contractions that have a space before and a character after.
    text = " ".join(CONTRACTION_MAP.get(token, token) for token in text.split())
    text = convert_emoticons_to_word(text)
    text = convert_smiley_to_word(text)
    # Only the matched hashtag is split and substituted, so characters that
    # precede the '#' inside the same token are not duplicated.
    text = _HASHTAG_RE.sub(_split_hashtag, text)
    text = " ".join(token for token in text.split() if token not in stop_words)
    return _DISALLOWED_RE.sub("", text)


num = df.shape[0]

# Fill the whole column in one assignment; this does not depend on the
# DataFrame index being exactly 0..num-1.
df['text_format'] = [
    preprocess_text(raw_text)
    for raw_text in tqdm(df['text'], total=num, ascii=True, desc='pre-processing text')
]

In [None]:
# Collect the row labels whose cleaned text has fewer than three
# whitespace-separated tokens, then remove those rows and renumber from 0.
num = df.shape[0]
less_three = [
    i
    for i in tqdm(range(0, num), ascii=True, desc='less than 3words text')
    if len(str(df.loc[i, 'text_format']).split()) < 3
]

df = df.drop(labels=less_three, axis=0).reset_index(drop=True)
df

In [None]:
# Write the cleaned tweets to processed_tweets/<company_name>.csv, with a
# header row and without the index column.
path = f'processed_tweets/{company_name}.csv'
df.to_csv(path, index=False, header=True)