# Text Cleaning Pipeline
The goal of this notebook is to import the video comments and the news-source transcripts, join the data, and compute sentiment scores for both the source transcript and the comment text before performing analysis.

In [1]:
from _functions import *

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Trevo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Trevo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Create Model Input
Build the model input by combining each video's comments with its transcript.

In [2]:
# Locate the data folders relative to the current working directory: "Data" sits
# in the parent of the working directory, with one sub-folder for the comment
# exports and one for the video transcripts.
parent_directory = Path().resolve().parent
data_directory = parent_directory.joinpath("Data")
comments_directory = data_directory.joinpath("video_comments")
transcripts_directory = data_directory.joinpath("video_transcripts")

In [3]:
# Placeholder stored for sources that have no score/rating available. The loading
# loop below skips every source whose reliability score equals this value.
DUDNUM = 999999999

# Reliability score per news source, keyed by the source name used in the data file names.
reliability_scores = {
    "nbc_news": 42.78,
    "wsj": 48.87,
    "bbc_news": 44.73,
    "bloomberg": 42.11,
    "cnn": 42.13,
    "sixty_minutes": 34.19,
    "sky_news": 42.24,
    "wusa": 46.67,
    "dw_news": DUDNUM,
    "forbes": 41.06,
    "fox_news": 24.83,
    "podcast_cvt": DUDNUM,
    "econ_explained": DUDNUM,
}

# Numeric bias score per news source (same keys as reliability_scores).
bias_scores = {
    "nbc_news": -5.61,
    "wsj": -0.27,
    "bbc_news": -1.33,
    "bloomberg": -3.16,
    "cnn": -6.18,
    "sixty_minutes": -9.55,
    "sky_news": -0.88,
    "wusa": -1.50,
    "dw_news": DUDNUM,
    "forbes": -3.87,
    "fox_news": 18.50,
    "podcast_cvt": DUDNUM,
    "econ_explained": DUDNUM,
}

# Categorical bias label per news source; this becomes the 'leaning' column.
bias_ratings = {
    "nbc_news": "Lean Left",
    "wsj": "Center",
    "bbc_news": "Center",
    "bloomberg": "Lean Left",
    "cnn": "Lean Left",
    "sixty_minutes": "Lean Left",
    "sky_news": "Lean Left",
    # NOTE(review): this entry previously held the float -1.50 (the wusa value from
    # bias_scores), which put a number into a column of text labels. "Center" is a
    # best guess - TODO confirm against the rating source.
    "wusa": "Center",
    "dw_news": DUDNUM,
    "forbes": "Center",
    "fox_news": "Right",
    "podcast_cvt": DUDNUM,
    "econ_explained": DUDNUM,
}

In [4]:
# Column layouts of the two tables built by this cell.
transcript_columns = ['index', 'source', 'transcript']
comment_columns = ['index', 'source', 'leaning', 'reliability_score', 'bias_score',
                   'vader_transcript', 'vader_comment', 'comment']

# Collect per-video results in plain lists and build each DataFrame once at the end.
# Concatenating onto a DataFrame inside the loop is quadratic, and concatenating onto
# an initially empty frame raises a pandas FutureWarning.
transcript_records = []
comment_frames = []

# Iterate over each comments .csv file; sorted() makes the processing order deterministic
for file_name in sorted(os.listdir(comments_directory)):

    # Skip directory entries that are not comment exports
    if not file_name.endswith(".csv"):
        continue

    # Construct comments file path
    file_path = comments_directory / file_name

    # Remove ".csv" and concatenate "_transcript.txt" for file_name
    transcript_file_name = file_name.replace(".csv", "_transcript.txt")
    transcript_file_path = transcripts_directory / transcript_file_name

    # NOTE : Cannot make use of any data without a transcript, so check for it
    #        before doing any further work on this file
    if not transcript_file_path.exists():
        continue

    # Unique identifiers for the data: "<source><number>.csv" -> (source, number)
    source_name = re.sub(r"[0-9]+\.csv", "", file_name)
    index = int(re.sub(r"\D", "", file_name))

    # Sources without a real reliability score (DUDNUM placeholder) are left out
    if reliability_scores[source_name] == DUDNUM:
        continue

    with open(transcript_file_path, "r", encoding="utf-8") as file:
        transcript_text = file.read().strip()

    # Read in the CSV file for the comments, discarding rows with missing values
    input_df = pd.read_csv(file_path, encoding="utf-8").dropna()

    # Drop 'Unnamed: 0' column if it exists, as not all csv pulls have the index
    if 'Unnamed: 0' in input_df.columns:
        input_df = input_df.drop(columns=['Unnamed: 0'])

    # Drop the row labelled 0. errors="ignore" keeps this from raising a KeyError
    # when dropna() has already removed that row. Doing it before scoring also
    # avoids scoring a row that is thrown away.
    input_df = input_df.drop(index=0, errors="ignore")

    # Record the transcript as one row of the transcript table. A dict keeps
    # 'index' an int; a numpy array would coerce every field to a string.
    transcript_records.append(
        {'index': index, 'source': source_name, 'transcript': transcript_text}
    )

    # Nothing to score if no comment survived the cleaning above
    if input_df.empty:
        continue

    # Add the news source identifiers and scores to the dataframe
    input_df['index'] = index
    input_df["source"] = source_name
    input_df['leaning'] = bias_ratings[source_name]
    input_df["reliability_score"] = reliability_scores[source_name]
    input_df["bias_score"] = bias_scores[source_name]

    # Add the VADER sentiment scores: one value for the whole transcript
    # (repeated on every row) and one value per comment
    input_df["vader_transcript"] = analyze_sentiment_vader(transcript_text)
    input_df["vader_comment"] = input_df['comment'].apply(analyze_sentiment_vader)

    comment_frames.append(input_df)

transcript_df = pd.DataFrame(transcript_records, columns=transcript_columns)

if comment_frames:
    # Row labels from each CSV are kept as-is (no ignore_index), matching the
    # labels this cell has always produced.
    comments_df = pd.concat(comment_frames)
    # Put the standard columns first, followed by any extra columns from the CSVs
    extra_columns = [col for col in comments_df.columns if col not in comment_columns]
    comments_df = comments_df[comment_columns + extra_columns]
else:
    comments_df = pd.DataFrame(columns=comment_columns)

transcript_df.head()

  comments_df = pd.concat([comments_df, input_df])


Unnamed: 0,index,source,transcript
0,1,bbc_news,We begin with the breaking news. A fresh trade...
1,2,bbc_news,There are growing fears of a major global trad...
2,3,bbc_news,I'm Helena Humphrey. Good to have you with us....
3,1,bloomberg,"Congratulations, Mr.\n\nWell, he's back and gu..."
4,3,bloomberg,We are continuing to get headlines out of the ...


In [5]:
# Preview the first five rows of the combined comments table
comments_df.head(n=5)

Unnamed: 0,index,source,leaning,reliability_score,bias_score,vader_transcript,vader_comment,comment
1,1,bbc_news,Center,44.73,-1.33,-0.8748,0.3182,"You tariff me, I tariff you. It seems fair to ..."
2,1,bbc_news,Center,44.73,-1.33,-0.8748,0.0,Wonder how Trump administration is going to be...
3,1,bbc_news,Center,44.73,-1.33,-0.8748,-0.7131,"if Fentanyl enter US through Canada, why don't..."
4,1,bbc_news,Center,44.73,-1.33,-0.8748,0.296,"So trump said he’d tariff Canada, Canada threa..."
5,1,bbc_news,Center,44.73,-1.33,-0.8748,-0.6597,America needs to stop blaming other countries ...


In [6]:
# Write both tables to CSV under the location returned by get_relative_path(),
# leaving the DataFrame row labels out of the files.
for frame, csv_name in ((comments_df, "comments.csv"), (transcript_df, "transcripts.csv")):
    frame.to_csv(get_relative_path() / csv_name, index=False)