In [None]:
# Import TensorFlow and bring up the TPU runtime for this session.
import tensorflow as tf
# The resolver is built with no arguments, so TPU discovery is left to its defaults.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# Connect this runtime to the resolved cluster, then initialise the TPU system.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# Distribution strategy object bound to the resolved TPU.
strategy = tf.distribute.TPUStrategy(tpu)

In [None]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.10.1
    Uninstalling imbalanced-learn-0.10.1:
      Successfully uninstalled imbalanced-learn-0.10.1
Successfully installed imbalanced-learn-0.11.0


In [None]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from imblearn.over_sampling import SMOTE  # Import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Mount Google Drive inside the Colab runtime so the dataset file is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Load the airline tweets CSV from the mounted Drive into the DataFrame `df`.
df = pd.read_csv('/content/drive/MyDrive/yuksekTez/airline_dataset/Tweets.csv')

Mounted at /content/drive


In [None]:
#Cleaning the text data

# Translation table that deletes every ASCII punctuation character; built once.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the English stopword set (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
  """Return NLTK's English stopwords as a frozenset, loading them only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def clean_text(text):
  """Normalise one tweet into a lowercase, space-separated string of tokens.

  Steps, in order: strip URLs, strip @mentions, drop the '#' of hashtags
  (keeping the word), delete ASCII punctuation, delete digits, remove English
  stopwords, lowercase.

  Args:
    text: The raw tweet as a ``str``.

  Returns:
    The cleaned, lowercased text. May be an empty string when nothing is left.

  Note:
    Punctuation is removed before the stopword filter, so contractions lose
    their apostrophe first (e.g. "didn't" -> "didnt") and are kept unless the
    apostrophe-free form is itself in the stopword list.
  """
  # Remove URLs
  text = re.sub(r'http\S+', '', text)
  # Remove @mentions
  text = re.sub(r'@\w+', '', text)
  # Remove only the hashtag sign, keeping the tagged word itself
  text = re.sub(r'#(\w+)', r'\1', text)
  # Remove punctuation
  text = text.translate(_PUNCTUATION_TABLE)
  # Remove numbers
  text = re.sub(r'\d+', '', text)
  # Remove stopwords; the stopword set is only fetched when there are words to
  # filter, and it is cached so the corpus is not re-read for every word.
  words = text.split()
  if words:
    stop_words = _english_stopwords()
    words = [word for word in words if word.lower() not in stop_words]
  return ' '.join(words).lower()

In [None]:
# Apply clean_text to every tweet and store the result in a new column.
# This is a plain pandas operation, so it is not wrapped in strategy.scope():
# the call creates no TensorFlow objects, and leaving the scope out means the
# cell does not fail with a NameError when the TPU cell has not been run.
df['clean_text1'] = df['text'].apply(clean_text)

In [None]:
# Preview the first rows of df, which now include the clean_text1 column.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,clean_text1
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),said
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),plus youve added commercials experience tacky
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),didnt today must mean need take another trip
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),really aggressive blast obnoxious entertainmen...
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),really big bad thing


In [None]:
# Keep only the raw tweet, its sentiment label and the cleaned tweet
selected_columns = ["text", "airline_sentiment", "clean_text1"]
df_selected = df.loc[:, selected_columns]

# Show the reduced dataframe
print(df_selected)

                                                    text airline_sentiment  \
0                    @VirginAmerica What @dhepburn said.           neutral   
1      @VirginAmerica plus you've added commercials t...          positive   
2      @VirginAmerica I didn't today... Must mean I n...           neutral   
3      @VirginAmerica it's really aggressive to blast...          negative   
4      @VirginAmerica and it's a really big bad thing...          negative   
...                                                  ...               ...   
14635  @AmericanAir thank you we got on a different f...          positive   
14636  @AmericanAir leaving over 20 minutes Late Flig...          negative   
14637  @AmericanAir Please bring American Airlines to...           neutral   
14638  @AmericanAir you have my money, you change my ...          negative   
14639  @AmericanAir we have 8 ppl so we need 2 know h...           neutral   

                                             clean_text1  
0   

In [None]:
# Preview the last rows of the three-column selection.
df_selected.tail()

Unnamed: 0,text,airline_sentiment,clean_text1
14635,@AmericanAir thank you we got on a different f...,positive,thank got different flight chicago
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative,leaving minutes late flight warnings communica...
14637,@AmericanAir Please bring American Airlines to...,neutral,please bring american airlines blackberry
14638,"@AmericanAir you have my money, you change my ...",negative,money change flight dont answer phones suggest...
14639,@AmericanAir we have 8 ppl so we need 2 know h...,neutral,ppl need know many seats next flight plz put u...


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize and pad the cleaned tweets.
# Tokenizer and pad_sequences are host-side Python utilities, so this work is
# not wrapped in strategy.scope(); leaving the scope out also means the cell
# does not depend on the TPU cell having been run.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_text1'])
# NOTE: word_index is not truncated by num_words (the recorded run printed
# 12874 with num_words=10000), so vocab_size counts every token seen during
# fitting, not only the ones texts_to_sequences keeps.
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(df['clean_text1'])
# Every sequence is padded or cut at the end ("post") to exactly maxlen ids.
maxlen = 100
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding="post", truncating="post")
print("Maximum Sequence Length:", maxlen)
print("Vocab Size: ", vocab_size)

Maximum Sequence Length: 100
Vocab Size:  12874


In [None]:
# How many rows does the cleaned-text column hold?
num_items = df_selected["clean_text1"].size
print("Number of items in clean_text1:", num_items)

# Which distinct sentiment labels occur?
unique_values = df_selected["airline_sentiment"].unique()
print("Unique values in airline_sentiment:", unique_values)

Number of items in clean_text1: 14640
Unique values in airline_sentiment: ['neutral' 'positive' 'negative']


In [None]:
# Count the missing entries in the cleaned-text column
num_nans = df_selected["clean_text1"].isnull().sum()
print(num_nans)

0


In [None]:
# Class distribution of the sentiment labels before any resampling.
item_counts = df_selected['airline_sentiment'].value_counts()
print(item_counts)

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64


In [None]:
# Remove rows whose 'clean_text1' is NaN, then renumber the index from zero
df_selected = (
    df_selected
    .dropna(subset=['clean_text1'])
    .reset_index(drop=True)
)

# Report how many rows remain
print("Number of items after removing NaN rows:", len(df_selected))


Number of items after removing NaN rows: 14640


In [None]:
# Vectorize the cleaned tweets using TF-IDF.
# Features and labels are both read from df_selected (the frame that had rows
# with missing clean_text1 dropped and its index reset), so they stay aligned
# row-for-row and cover the same rows that are saved as the cleaned dataset.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df_selected['clean_text1'])

# Apply SMOTE to balance the dataset (fixed random_state for repeatability).
# NOTE(review): resampling happens on the full dataset here; if a train/test
# split is made later from the saved files, synthetic samples could end up in
# the test portion - confirm how these files are consumed.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, df_selected['airline_sentiment'].values)

# Save the preprocessed and balanced features (X_resampled) to a CSV file.
# NOTE(review): toarray() turns the resampled matrix into a dense array with
# one column per TF-IDF term; this is presumably large in memory and on disk -
# confirm it fits the runtime.
df_resampled_features = pd.DataFrame(data=X_resampled.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
df_resampled_features.to_csv('/content/drive/MyDrive/yuksekTez/airline_dataset/preprocessed_balanced_features.csv', index=False)

# Save the balanced labels (y_resampled) to a CSV file
df_resampled_labels = pd.DataFrame({'airline_sentiment': y_resampled})
df_resampled_labels.to_csv('/content/drive/MyDrive/yuksekTez/airline_dataset/preprocessed_balanced_labels.csv', index=False)

In [None]:
# Class distribution of the labels after SMOTE; reuses the name item_counts.
item_counts = df_resampled_labels['airline_sentiment'].value_counts()
print(item_counts)

neutral     9178
positive    9178
negative    9178
Name: airline_sentiment, dtype: int64


In [None]:
# Save the whole df_selected frame (raw text, sentiment label and cleaned text) to a CSV file.
df_selected.to_csv('/content/drive/MyDrive/yuksekTez/airline_dataset/cleaned_text1.csv', index=False)

In [None]:
# Create a single DataFrame holding the resampled TF-IDF features plus the label column.
# NOTE(review): this densifies X_resampled with toarray() a second time; the
# features-only frame written earlier in the notebook holds the same values.
columns = tfidf_vectorizer.get_feature_names_out()
df_resampled = pd.DataFrame(data=X_resampled.toarray(), columns=columns)
# The sentiment label is appended as the last column.
df_resampled['airline_sentiment'] = y_resampled

# Save the combined dataset (features and label together) to a CSV file
df_resampled.to_csv('/content/drive/MyDrive/yuksekTez/airline_dataset/preprocessed_balanced_dataset.csv', index=False)