<a href="https://colab.research.google.com/github/Yhola/Discourse-Analysis-Tool/blob/main/DiscourseAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries in Google Colab
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install nltk
!pip install python-docx
!pip install google-api-python-client
!pip install google-auth-httplib2
!pip install google-auth-oauthlib

# Import required libraries
import spacy
import nltk
import docx
import re
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from collections import Counter
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Download required nltk resources
# Fetch both the legacy resource names and the names newer NLTK releases look
# up ('punkt_tab', 'averaged_perceptron_tagger_eng'), so word_tokenize /
# sent_tokenize / pos_tag work regardless of the installed NLTK version.
# With its default arguments nltk.download() reports a package it cannot
# find and returns False instead of raising, so the extra names are harmless.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(_nltk_resource)

# Authenticate the Colab user, then build a Google Drive API v3 client
# (googleapiclient's `build`, not PyDrive). No credentials are passed to
# build() explicitly -- presumably it picks up the ones established by
# authenticate_user(); confirm before running this outside Colab.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Function to download files from Google Drive
def download_file(file_name, file_id):
    """Download a Google Drive file and save it to local disk.

    Args:
        file_name: Local path the downloaded bytes are written to.
        file_id: Drive ID of the file to fetch.

    The content is pulled chunk by chunk into an in-memory buffer through the
    module-level ``drive_service`` client; the local file is only created
    once the whole download has finished.
    """
    media_request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    chunk_downloader = MediaIoBaseDownload(buffer, media_request)

    finished = False
    while not finished:
        _progress, finished = chunk_downloader.next_chunk()

    # getvalue() returns the entire buffer regardless of the stream position.
    with open(file_name, 'wb') as out_file:
        out_file.write(buffer.getvalue())

# File IDs for "Written Discourse" and "Spoken Discourse"
written_discourse_id = '019cJYlBCSj8000WeXKYc2XuVTUdnbhy73BcwRvGD'
spoken_discourse_id = '34BUR457bDs9ZlY6tkJZVBybfcsfg_IF_cE2m'

# Fetch each document from Drive into the working directory
for _local_name, _drive_id in (
    ('Written_Discourse.docx', written_discourse_id),
    ('Spoken_Discourse.docx', spoken_discourse_id),
):
    download_file(_local_name, _drive_id)

# Function to read Word files
def read_word_file(file_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        The ``text`` of every entry in ``Document.paragraphs``, joined with
        newline characters.
    """
    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

# Read the discourse files
written_text, spoken_text = map(
    read_word_file,
    ('Written_Discourse.docx', 'Spoken_Discourse.docx'),
)

# Function to tokenize text and remove stopwords
def preprocess_text(text):
    """Tokenize *text*, lowercase it, and drop punctuation and stopwords.

    Args:
        text: Raw text to split with NLTK's ``word_tokenize``.

    Returns:
        A list of lowercased tokens that are either purely alphanumeric or
        hyphenated words (e.g. ``well-known``), with NLTK's English
        stopwords removed.
    """
    stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for word in word_tokenize(text):
        # Keep hyphenated words, but require at least one letter or digit so
        # that bare dash tokens ("-", "--") are not counted as words.
        is_hyphenated_word = '-' in word and any(ch.isalnum() for ch in word)
        if not (word.isalnum() or is_hyphenated_word):
            continue
        lowered = word.lower()
        if lowered not in stop_words:
            kept_tokens.append(lowered)
    return kept_tokens

# Preprocess the texts
written_tokens, spoken_tokens = map(
    preprocess_text,
    (written_text, spoken_text),
)

# Function to calculate average sentence length
def average_sentence_length(text):
    """Return the mean number of tokens per sentence in *text*.

    Sentences come from ``nltk.sent_tokenize`` and each one is split with
    ``word_tokenize``; every token that tokenizer emits counts toward the
    sentence's length.

    Returns:
        Total token count divided by the number of sentences, or 0 when no
        sentence is found.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0
    tokens_per_sentence = [len(word_tokenize(sentence)) for sentence in sentences]
    return sum(tokens_per_sentence) / len(sentences)

# Function to count passive voice constructions using spaCy
# Loaded spaCy pipelines keyed by model name, so repeated calls reuse them.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for *model_name*, loading it at most once."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def count_passive_voice(text):
    """Count the sentences in *text* that contain a passive construction.

    A sentence counts once, however many passive constructions it holds. It
    qualifies when any of its tokens has the dependency label ``auxpass``,
    or is tagged ``VBN`` with a head labelled ``auxpass``.

    Args:
        text: Raw text to parse with the ``en_core_web_sm`` pipeline.

    Returns:
        The number of qualifying sentences (an int).
    """
    # Loading the model is expensive, so reuse a cached pipeline instead of
    # calling spacy.load() on every invocation.
    doc = _get_spacy_pipeline()(text)
    return sum(
        1
        for sentence in doc.sents
        if any(
            token.dep_ == 'auxpass'
            or (token.tag_ == 'VBN' and token.head.dep_ == 'auxpass')
            for token in sentence
        )
    )

# Function to calculate type-token ratio (TTR)
def type_token_ratio(tokens):
    """Return the ratio of distinct tokens to total tokens.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        ``len(set(tokens)) / len(tokens)``, or 0 for an empty sequence.
    """
    total = len(tokens)
    if total == 0:
        return 0
    distinct = len(set(tokens))
    return distinct / total

# Function to calculate average word length
def average_word_length(tokens):
    """Return the mean character length of the given tokens.

    Args:
        tokens: Sequence of strings.

    Returns:
        Total number of characters divided by the number of tokens, or 0 for
        an empty sequence.
    """
    token_count = len(tokens)
    if token_count == 0:
        return 0
    total_characters = sum(map(len, tokens))
    return total_characters / token_count

# Function to count questions
def count_questions(text):
    """Return how many ``?`` characters occur in *text*.

    Every question mark is counted individually, so ``"??"`` contributes 2.
    """
    return sum(1 for character in text if character == '?')

# Function to count imperative verbs
def count_imperatives(tokens):
    """Count the tokens that NLTK's ``pos_tag`` labels ``VB``.

    The ``VB`` tag is used as a stand-in for imperative verbs: every token
    with that tag is counted, with no further check on how the verb is used.

    Args:
        tokens: Sequence of word tokens to tag.

    Returns:
        The number of tokens tagged ``VB`` (an int).
    """
    tagged_tokens = pos_tag(tokens)
    return sum(1 for _word, tag in tagged_tokens if tag == 'VB')

# Function to count discourse markers
def count_discourse_markers(tokens):
    """Return the number of discourse markers per 100 tokens.

    Matching is case-insensitive and phrase-aware: a multi-word marker such
    as "you know" is counted when its words appear as consecutive tokens.

    Args:
        tokens: Sequence of word tokens in document order.

    Returns:
        Marker occurrences divided by ``len(tokens) / 100``, or 0 for an
        empty token sequence.

    Note:
        Several markers consist of very common words ("so", "you know",
        "kind of"). If *tokens* has already had stopwords removed (as
        ``preprocess_text`` does), those words may be gone; pass unfiltered
        tokens for a complete count.
    """
    discourse_markers = ["well", "like", "you know", "so", "actually", "basically", "I mean", "okay", "right", "kind of"]
    if len(tokens) == 0:
        return 0

    # Represent each marker as a tuple of lowercase words so single- and
    # multi-word markers can be matched the same way.
    marker_phrases = {tuple(marker.lower().split()) for marker in discourse_markers}
    phrase_lengths = sorted({len(phrase) for phrase in marker_phrases})
    lowered = [token.lower() for token in tokens]

    hits = 0
    for start in range(len(lowered)):
        for size in phrase_lengths:
            # Lengths are ascending: once a window overruns the end, every
            # longer one does too. Stopping here also prevents a truncated
            # slice from re-matching a shorter marker.
            if start + size > len(lowered):
                break
            if tuple(lowered[start:start + size]) in marker_phrases:
                hits += 1
    return hits / (len(tokens) / 100)

# Function to count formal vocabulary
def count_formal_vocabulary(tokens):
    """Return the number of formal terms per 100 tokens.

    Matching is case-insensitive and phrase-aware: a multi-word term such as
    "on the other hand" is counted when its words appear as consecutive
    tokens.

    Args:
        tokens: Sequence of word tokens in document order.

    Returns:
        Term occurrences divided by ``len(tokens) / 100``, or 0 for an empty
        token sequence.

    Note:
        The multi-word terms are built from very common words ("in", "on",
        "the", "as", "a"). If *tokens* has already had stopwords removed (as
        ``preprocess_text`` does), those words may be gone; pass unfiltered
        tokens for a complete count.
    """
    formal_terms = ["hence", "thus", "therefore", "furthermore", "moreover", "consequently", "accordingly", "nevertheless", "in addition", "on the other hand", "as a result", "whereas", "in contrast", "nonetheless", "despite", "subsequently"]
    if len(tokens) == 0:
        return 0

    # Represent each term as a tuple of lowercase words so single- and
    # multi-word terms can be matched the same way.
    term_phrases = {tuple(term.lower().split()) for term in formal_terms}
    phrase_lengths = sorted({len(phrase) for phrase in term_phrases})
    lowered = [token.lower() for token in tokens]

    hits = 0
    for start in range(len(lowered)):
        for size in phrase_lengths:
            # Lengths are ascending: once a window overruns the end, every
            # longer one does too. Stopping here also prevents a truncated
            # slice from re-matching a shorter term.
            if start + size > len(lowered):
                break
            if tuple(lowered[start:start + size]) in term_phrases:
                hits += 1
    return hits / (len(tokens) / 100)

# Analyzing both texts
# Inputs for each mode of discourse: the raw text and its preprocessed tokens.
discourse_inputs = {
    'Written': {'text': written_text, 'tokens': written_tokens},
    'Spoken': {'text': spoken_text, 'tokens': spoken_tokens},
}

# Per linguistic factor: (element label, metric function, input it consumes).
metric_plan = {
    'Grammatical Complexity': [
        ('Passive Voice', count_passive_voice, 'text'),
        ('Sentence Length', average_sentence_length, 'text'),
    ],
    'Lexical Diversity': [
        ('Type-Token Ratio', type_token_ratio, 'tokens'),
        ('Average Word Length', average_word_length, 'tokens'),
    ],
    'Interactive Features': [
        ('Questions Count', count_questions, 'text'),
        ('Imperative Verbs', count_imperatives, 'tokens'),
    ],
    'Speech Markers': [
        ('Discourse Markers', count_discourse_markers, 'tokens'),
    ],
    'Formal Vocabulary Use': [
        ('Formal Vocabulary', count_formal_vocabulary, 'tokens'),
    ],
}

# Each metric is evaluated for Written, then Spoken, before moving on to the
# next metric, which fixes the row order of the resulting table.
analysis_results = {
    factor: {
        f'{label} ({mode})': metric(inputs[source])
        for label, metric, source in metrics
        for mode, inputs in discourse_inputs.items()
    }
    for factor, metrics in metric_plan.items()
}

# Flatten the nested dictionary into one record per (factor, element) pair
results_data = [
    {"Linguistic Factor": factor, "Element": element, "Value": value}
    for factor, sub_factors in analysis_results.items()
    for element, value in sub_factors.items()
]

# Tabulate the records, show them, and persist them as CSV
results_df = pd.DataFrame(results_data)
print(results_df)
results_df.to_csv('discourse_analysis_refined_results.csv', index=False)



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed pyt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


         Linguistic Factor                        Element      Value
0   Grammatical Complexity        Passive Voice (Written)  16.000000
1   Grammatical Complexity         Passive Voice (Spoken)  30.000000
2   Grammatical Complexity      Sentence Length (Written)  21.032895
3   Grammatical Complexity       Sentence Length (Spoken)  23.750760
4        Lexical Diversity     Type-Token Ratio (Written)   0.493769
5        Lexical Diversity      Type-Token Ratio (Spoken)   0.357695
6        Lexical Diversity  Average Word Length (Written)   7.202374
7        Lexical Diversity   Average Word Length (Spoken)   6.367182
8     Interactive Features      Questions Count (Written)   9.000000
9     Interactive Features       Questions Count (Spoken)  74.000000
10    Interactive Features     Imperative Verbs (Written)  34.000000
11    Interactive Features      Imperative Verbs (Spoken)  95.000000
12          Speech Markers    Discourse Markers (Written)   0.593472
13          Speech Markers     Dis

# New Section