Note: Estimated run time of notebook is 5-10 min.

In [92]:
# psh270, jxs535, fgp424, hkp680

# Task 2
import nltk as nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')
nltk.download('all')
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from cleantext.sklearn import CleanTransformer # likely required to ´pip install clean-text´
from cleantext import clean
data = pd.read_csv("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Computer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Computer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading corpus: Package 'corpus' not found in index
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Computer\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Computer\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Computer\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]

In [93]:
type_distribution = {}
for type in data["type"]:
    if type not in type_distribution:
        type_distribution[type] = 1
    else:
        type_distribution[type] += 1
type_distribution

{'unreliable': 6,
 'fake': 155,
 'clickbait': 1,
 'conspiracy': 31,
 'reliable': 3,
 'bias': 6,
 'hate': 1,
 'junksci': 6,
 'political': 23,
 nan: 12,
 'unknown': 6}

In [94]:
def drop_useless_data(data):
    #Dropping unneeded columns
    cols_to_delete = ["Unnamed: 0","id","scraped_at","inserted_at","updated_at"]
    for column in data.columns:
        if data[column].isnull().values.all():
            cols_to_delete.append(column)
    data.drop(cols_to_delete, 1, inplace=True)
    
    #Dropping entries with nan type
    data.dropna(subset = ["type"], inplace = True)
    #Dropping entries with unknown type
    data.drop(data.loc[data["type"] == "unknown"].index, inplace=True)
    not_enough_of_type = ["clickbait", "reliable", "unreliable", "bias", "hate", "junksci"]
    for t in not_enough_of_type:
        data.drop(data.loc[data["type"] == t].index, inplace=True)

drop_useless_data(data)

  data.drop(cols_to_delete, 1, inplace=True)


In [95]:
three_numb_date = r'(<number> <number> <number>)' #YYYY/MM/DD or DD/MM/YYYY or MM/DD/YYYY
literal_months_date= r'(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S* ((<number> ){1,2}|([0-9]{1,2}(st|nd|rd|th)))' #Eg. jun 2nd 2020, january 23. 2021
literal_months_reverse_date = r'((number {1,2})|[0-9]{1,2}(st|nd|rd|th)) *(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S*' #Eg. 10th february, 4th july
all_dates = (three_numb_date) +'|' + (literal_months_date) +'|'+ (literal_months_reverse_date)
multiple_chars = r'(.)\1{3,}'
special_symbols = r'([^<>a-z ])'#Matches special symbols such as © or ™
single_letter = r' [a-z] ' #matches single letters

In [96]:
def get_type_col_distribution(col, func):
    distribution = {}
    for type, val in zip(data["type"], data[col]):
        if type not in distribution:
            distribution[type] = (func(val), 1)
        else:
            cur_val, num_type = distribution[type]
            distribution[type] = cur_val + func(val), num_type + 1
    return distribution

def average_type_col_distrubtion(dist):
    avg_dist = {}
    for key in dist:
        total_val, num_type = dist[key]
        avg_val = total_val / num_type
        avg_dist[key] = avg_val
    return avg_dist

In [97]:
title_len_distribution = get_type_col_distribution("title", len)
avg_title_len_distribution = average_type_col_distrubtion(title_len_distribution)
avg_title_len_distribution

{'fake': 61.33548387096774,
 'conspiracy': 50.61290322580645,
 'political': 61.65217391304348}

In [98]:
content_len_distribution = get_type_col_distribution("content", len)
avg_content_len_distribution = average_type_col_distrubtion(content_len_distribution)
avg_content_len_distribution

{'fake': 4105.096774193548,
 'conspiracy': 4966.580645161291,
 'political': 3992.4347826086955}

In [99]:
has_author_distribution = get_type_col_distribution("authors", lambda authors : 0 if str(authors) == "nan" else 1)
has_author_distribution
average_type_col_distrubtion(has_author_distribution)

{'fake': 0.6709677419354839,
 'conspiracy': 0.967741935483871,
 'political': 0.782608695652174}

In [100]:
def num_sentences(text):
    multi_period = r'\.{2,}'
    #Replacing multiple periods with a single, ie. "..." -> "." 
    text = re.sub(multi_period, ".", text)
    count = 0
    for s in text.split("."):
        if s != "":
            count += 1
    return count

num_sentences_distribution = get_type_col_distribution("content", num_sentences)
average_num_sentences_distribution = average_type_col_distrubtion(num_sentences_distribution)

In [101]:
def word_count(text):
    count = 0
    for s in text.split(" "):
        if s != "":
            count += 1
    return count

In [102]:
def lexical_diversity(text):
    words = text.split(" ")
    num_unique_words = len(set(words))
    return num_unique_words/len(words)

In [103]:
initial_cleaner = CleanTransformer(fix_unicode=True,               # fix various unicode errors
                                    to_ascii=True,                  # transliterate to closest ASCII representation
                                    lower=True,                     # lowercase text
                                    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=True,                  # replace all URLs with a special token
                                    no_emails=True,                # replace all email addresses with a special token
                                    no_phone_numbers=True,         # replace all phone numbers with a special token
                                    no_numbers=False,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=True,      # replace all currency symbols with a special token
                                    no_punct=True,                 # remove punctuations
                                    replace_with_punct=" ",          # instead of removing punctuations you may replace them
                                    replace_with_url="<url>",
                                    replace_with_email="<email>",
                                    replace_with_phone_number="<phone>",
                                    replace_with_currency_symbol="<cur>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )

general_cleaner = CleanTransformer(fix_unicode=False,               # fix various unicode errors
                                    to_ascii=False,                  # transliterate to closest ASCII representation
                                    lower=False,                     # lowercase text
                                    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=False,                  # replace all URLs with a special token
                                    no_emails=False,                # replace all email addresses with a special token
                                    no_phone_numbers=False,         # replace all phone numbers with a special token
                                    no_numbers=True,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=False,      # replace all currency symbols with a special token
                                    no_punct=False,                 # remove punctuations
                                    replace_with_number="<number>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )



In [104]:
def WordFreq(col_name, article_number, input):
    q = input[col_name][article_number]
    unique_words = set(q)
    unique_word_count = len(unique_words)
    qqq = len(q)/unique_word_count
    return qqq

def WordFreqSet(set_value , set_data):
    WordFreqArray = []
    for col in set_data.index:
        WordFreqArray.append(WordFreq(set_value, col, set_data))
    return WordFreqArray

In [105]:
# switch tags, compatibility with lemmatise() 
def switchTag(tag):
    if tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('V'):
        return wordnet.VERB
    elif (tag.startswith('J') or
            tag.startswith('A')):
        return wordnet.ADJ
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN 

#string_test='In gold, the open interest SURPRISINGLY ROSE BY A CONSIDERABLE 9126 CONTRACTS UP TO582,421 WITH THE GOOD SIZED RISE IN PRICE OF GOLD WITH YESTERDAY’S TRADING ($5.55). IN ANOTHER HUGE DEVELOPMENT, WE RECEIVED THE TOTAL NUMBER OF GOLD EFP’S ISSUED FOR WEDNESDAY AND IT TOTALED A HUMONGOUS SIZED 12,223 CONTRACTS OF WHICH FEBRUARY SAW 11,023 CONTRACTS ISSUED AND APRIL SAW THE ISSUANCE OF 1200 CONTRACTS.'
#date_test  = '12/18/10 12/18/2020 12-18-10 12-18-2020 12/18/10 12/18/2020 12.18.10 12.18.2020 noise 12182010 december 18, 2010 janu 10th march 1st 3st january Dekjkj 10th  noise 10/20  noise noise 2020 10th january 2021'

def clean_column(data, col_name):
    for i, entry in zip(data[col_name].index, data[col_name]):    
        #We first convert to lower case and replace punctuation with space such that dates can
        #more easily be processed (eg. 10.12.2020 -> 10 12 2020 -> <NUMBER> <NUMBER> <NUMBER> instead of <NUMBER><NUMBER><DIGIT> or something)
        cleaned = initial_cleaner.transform([entry])[0]
        cleaned = general_cleaner.transform([cleaned])[0]
        cleaned = re.sub(all_dates, '<date> ', cleaned)
        cleaned = re.sub(special_symbols,'',cleaned)
        cleaned = re.sub(multiple_chars, '', cleaned)
        cleaned = re.sub(single_letter, '',cleaned)
        data.at[i, col_name] = cleaned

def clean_data(data):
    clean_column(data, "content")
    clean_column(data, "title")


def lemmatise_text(text): 
    buff = word_tokenize(text)
    buff = nltk.pos_tag(buff)
    lemmatizer = WordNetLemmatizer()
    lemmatised = []
    for index, word in enumerate(buff):
        buff[index] = (word[0], switchTag(word[1]))
        lemmatised.append(lemmatizer.lemmatize(buff[index][0],pos=buff[index][1]))
    return lemmatised

def count_and_clean(data, col_name):
    retVal = []
    last_length = 0
    tags = ["<number>","<date>","<url>","<email>","<phone>","<email>"]
    for i, entry in zip(data[col_name].index, data[col_name]):
        cleaned = entry
        arr = []
        last_length = len(entry)
        for tag in tags: 
            cleaned = cleaned.replace(tag, "")
            data.at[i, col_name] = cleaned
            arr.append((last_length-len(cleaned),tag))
            last_length = len(cleaned)
        retVal.append(arr)
    return retVal

In [106]:
# cleaning
clean_data(data)

In [107]:
# clean tags <..>
count = count_and_clean(data, "content")

# CALL WORD COUNT

lexical_diversity_distribution = get_type_col_distribution("content", lexical_diversity)
average_lexical_diversity = average_type_col_distrubtion(lexical_diversity_distribution)
average_lexical_diversity

{'fake': 0.5760302749007344,
 'conspiracy': 0.5625997901851042,
 'political': 0.5915298531509602}

In [108]:
word_count_distribution = get_type_col_distribution("content", word_count)
average_word_count_distribution = average_type_col_distrubtion(word_count_distribution)
average_word_count_distribution
average_word_len_distrubution = {t: avg_content_len_distribution[t]/average_word_count_distribution[t] for t in average_word_count_distribution}
average_word_len_distrubution

{'fake': 6.747078658834009,
 'conspiracy': 6.531370635897002,
 'political': 6.581565366972478}

In [109]:
average_sentence_len_distribution = {t: average_word_count_distribution[t]/average_num_sentences_distribution[t] for t in average_word_count_distribution}
average_sentence_len_distribution

{'fake': 16.435343325200417,
 'conspiracy': 19.211898940505296,
 'political': 16.729016786570742}

In [110]:
def get_complete_text_distribution(col):
    distribution = {}
    for type, text in zip(data["type"], data[col]):
        if type not in distribution:
            distribution[type] = text
        else:
            distribution[type] += " " + text
    return distribution

In [111]:
# Processing before finding frequency distribution:
eng_stopwords = stopwords.words('english')
#Find the frequency distribution for every type, eg. frequency distribution when looking at all fake news articles concatenated 
text_distribution = get_complete_text_distribution("content")

for type in text_distribution:
    # lematise 
    text_distribution[type] = lemmatise_text(text_distribution[type])
    # Delete stop-words
    text_distribution[type] = [word for word in text_distribution[type] if word not in eng_stopwords]
    # create freqDist object 
    text_distribution[type] = FreqDist(text_distribution[type])


In [112]:
for type in text_distribution:
    print(type + ":")
    text_distribution[type].pprint()

fake:
FreqDist({'blockchain': 260, 'market': 258, 'one': 244, 'time': 222, 'trump': 214, 'state': 203, 'say': 199, 'people': 195, 'like': 190, 'year': 188, ...})
conspiracy:
FreqDist({'one': 91, 'government': 75, 'year': 67, 'time': 60, 'american': 55, 'make': 49, 'state': 47, 'use': 46, 'obama': 46, 'people': 45, ...})
political:
FreqDist({'say': 70, 'people': 66, 'go': 52, 'state': 51, 'one': 46, 'thing': 43, 'would': 34, 'take': 32, 'use': 32, 'american': 31, ...})


Task 3
We have the following number of articles of each type:
fake: 155
conspiracy: 31
political: 23
unreliable: 6
junksci: 6
bias: 6
reliable: 3
clickbait: 1
hate: 1

From this we can see that there are not enough representants of classes 'hate', 'clickbait', 'reliable', 'bias', 'junksci', 'unreliable' to make meaningful observations. 
Therefore we chose to not include the listed types in our analysis.

We have observed that conspiracy articles often have authors noted (0.96%) compared to fake news (0.67%) and political (78%). 

We have observed that conspiracy articles tend to be longer.

We have observed that words such as "blockchain", "market", "trump", and "state" are relatively common in fake news. 
For conspiracy common words of interest instead include "government", "american", and "obama".
And for political these include "state" and "american".

We have observed that conspiracy tends to use longer sentences than fake and political (19.2 words on average compared to 16.4 and 16.73 respectively).

We have observed that all fake news articles have been distributed by beforeitsnews.com.

Besides that we have observed that no fake news articles contains meta_keywords or meta_description.

Reliable data seems to have lower word length count per sentence and longer sentence length

TASK 4 

In [113]:
# Initialize Group SubString
group_nr = 14
group_substring_raw = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[group_nr%23:group_nr%23+10]
group_substring = ""

for letter in np.sort(list(group_substring_raw)):
    group_substring += letter
    
print(group_substring)

AOPRSTUVWZ


In [114]:
# Get main page and add subpage (according to group_substring) urls to list
response = requests.get('https://en.wikinews.org/wiki/Category:Politics_and_conflicts')
contents = response.text

soup = BeautifulSoup(contents, 'html.parser')

subpages = []
for a in soup.find_all('a', href=True):
    for letter in group_substring:
        if "conflicts&from="+letter in a["href"]:
            if not a["href"] in subpages:
                subpages.append(a["href"])  

In [115]:
# Creates a list of article urls to scrape later
Articles = []
for url in subpages:
    response = requests.get(url)
    contents = response.text

    soup = BeautifulSoup(contents, 'html.parser')

    allGroups = soup.find_all("div",attrs={"class":"mw-category-group"})
    for n in allGroups:
        if "<h3>"+url[-1]+"</h3>" in str(n) and "<ul><li><a" in str(n):
            pages = n
            break
    ul = re.findall('\/wiki.*(?=title)',str(pages))
    for i in range(len(ul)):
        ul[i] = "https://en.wikinews.org" + ul[i][:-2]

    Articles.append(ul)

In [116]:
def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('', text)

def GA_GetDate(soup):
    try: # Need to convert from "MonthName Day, Year" to "Year-Month-Day"
        date = re.findall('[1-9]+.*[1-9]',str(soup.find("span", attrs={"id":"publishDate"})))[0]
    except:
        try: 
            date = re.findall('[A-Z][a-z]+ [0-9]+, [0-9][0-9][0-9][0-9]',str(soup.find("div", attrs={"class":"mw-parser-output"})))[0]
        except:
            date = "NaN"
    return date

def GA_GetText(soup):
    try:
        text = soup.get_text() #currently displays EVERYTHING on page, needs work
    except:
        text = "NaN"
    return text
        
def GA_GetSources(soup):
    srcs = []
    try:
        src = soup.find_all("span",attrs={"class":"sourceTemplate"})
        for n in src:
            m = remove_tags(str(n))
            srcs.append(m)      
    except:
        src = "NaN"
    return srcs
        
def GA_GetTitle(soup):
    try: 
        title = soup.find("h1",attrs={"id":"firstHeading"})
        title = remove_tags(str(title))
    except:
        title = "NaN"
    return title
        
def GA_GetContent(soup):
    try:
        article_text = ""
        article = soup.find("div",attrs={"class":"mw-parser-output"}).findAll('p')
        for element in article:
            article_text += '\n' + ''.join(element.findAll(text = True))
        test_text = article_text.replace("\n","")
        if test_text == "":
            test_text = re.findall('^.*(?=Have an opinion on this story?)',article_text)
            if test_text == "":
                article_text = "NaN"
            else:
                article_text = test_text
                article_text = re.sub('[A-Z][a-z]*, [A-Z][a-z]* [0-9]*, [0-9]*',"",article_text)
                article_text = re.sub('File:.*\.[a-z][a-z][a-z][a-z]?(?=)',"",article_text)
        else:
            article_text = test_text
            article_text = re.sub('[A-Z][a-z]*, [A-Z][a-z]* [0-9]*, [0-9]*',"",article_text)
            article_text = re.sub('File:.*\.[a-z][a-z][a-z][a-z]?(?=)',"",article_text)
            
    except:
        article_text = "NaN"
    return article_text
        
def GA_GetCategories(soup):
    try:
        categories = []
        cat = soup.find("div",attrs={"class":"mw-normal-catlinks"})
        cat = cat.findAll("ul")[0]
        for c in cat:
            cat = remove_tags(str(c))
            categories.append(cat)
    except:
        categories = "NaN"
    return categories

In [117]:
def GrabArticle(url):
    
    # init soup stuff
    response = requests.get(url)
    contents = response.text
    soup = BeautifulSoup(contents, 'html.parser')
    
    # Will try to find date, date, sources, title, text and categories, if none found/error, will return NaN
    date = GA_GetDate(soup)        
    text = GA_GetText(soup)
    srcs = GA_GetSources(soup)      
    title = GA_GetTitle(soup)
    article_text = GA_GetContent(soup) # Also removes date formatting from top of page
    categories = GA_GetCategories(soup)

    words = article_text.split()
    avg_word = sum(len(word) for word in words) / len(words)
    

    return date, srcs, title, article_text, categories, avg_word

In [118]:
# Scraping article pages for data and adding to lists for dataframe creation
urls,dates,sources,titles,article_text,scraped_at,numberOfWords,categories,avg_words = [],[],[],[],[],[],[],[],[]

for articles in Articles:
    for url in articles:
        now = datetime.now()
        
        d,s,t,at,c,aw = GrabArticle(url)
        
        urls.append(url)
        
        dates.append(d)
        sources.append(s)
        titles.append(t)
        article_text.append(at)
        categories.append(c)
        
        avg_words.append(aw)
        
        
        numberOfWords.append(len(at.split()))
        
        
        scraped_at.append(now.strftime("%d/%m/%Y %H:%M:%S"))
        
        
        
Task4df = pd.DataFrame(data = {"Title" : titles,  "(Raw) No. Words" : numberOfWords, "(Raw) Avg. Word Length" : avg_words, "Date written" : dates, "Content": article_text, "Categories" : categories , "URL" : urls, "Sources" : sources, "Scraped at" : scraped_at})

Task4df

Unnamed: 0,Title,(Raw) No. Words,(Raw) Avg. Word Length,Date written,Content,Categories,URL,Sources,Scraped at
0,A 1-year long strike against FMC Novamed: Wome...,674,5.081602,2007-09-17,"In a free trade zone in Antalya, Turkey, 80 w...","[September 17, 2007, Articles with broken sour...",https://en.wikinews.org/wiki/A_1-year_long_str...,"[Ertuğrul Mavioğlu. ""Serbest bölgede bir yıllı...",04/03/2022 15:40:46
1,A policeman is killed and another one is tortu...,578,5.399654,2005-02-16,On the police officer Luiz Pereira da Silva ...,"[February 16, 2005, Published, Archived, Brazi...",https://en.wikinews.org/wiki/A_policeman_is_ki...,"[Késia Souza. ""SDS reforçará investigação"" — F...",04/03/2022 15:40:47
2,"A timeline: Novak, Rove, Cooper",299,5.113712,2005-07-15,The Novak story that sparked the contoversy w...,"[July 15, 2005, Iraq War, Iraq, Politics and c...",https://en.wikinews.org/wiki/A_timeline:_Novak...,"[David Johnston and Richard W. Stevenson. ""Sou...",04/03/2022 15:40:47
3,Abbas and Olmert meet before Bush visit,471,5.154989,2008-01-08,Israeli Prime Minister Ehud Olmert and Palest...,"[January 8, 2008, Published, Archived, Middle ...",https://en.wikinews.org/wiki/Abbas_and_Olmert_...,"[Jim Teeple. ""Abbas, Olmert Meet Before Bush V...",04/03/2022 15:40:47
4,Abbas fires security chiefs for failure to cur...,245,5.306122,2005-04-02,Palestinian Authority chairman Mahmoud Abbas ...,"[April 2, 2005, Published, Archived, Palestine...",https://en.wikinews.org/wiki/Abbas_fires_secur...,"[Mohammed Assadi. ""Abbas Fires Security Chiefs...",04/03/2022 15:40:47
...,...,...,...,...,...,...,...,...,...
1722,Zimbabwean politician Bennett and four others ...,453,5.165563,2018-01-19,"Roy Bennett, a key Zimbabwean opposition lead...","[January 19, 2018, Published, Archived, Africa...",https://en.wikinews.org/wiki/Zimbabwean_politi...,"[Farai Mutsaka. ""Zimbabwe mourns Bennett, ‘sha...",04/03/2022 15:44:33
1723,Zimbabwean rivals sign power sharing deal,345,5.257971,2008-09-15,Zimbabwean president Robert Mugabe and opposi...,"[September 15, 2008, Audio reports, Zimbabwe, ...",https://en.wikinews.org/wiki/Zimbabwean_rivals...,"[ ""Zimbabwe rivals in historic pact"" — BBC New...",04/03/2022 15:44:33
1724,Zimbabwean unity talks fail,355,5.202817,2008-10-05,Talks between Zimbabwean president Robert Mug...,"[October 5, 2008, Published, Zimbabwe, Politic...",https://en.wikinews.org/wiki/Zimbabwean_unity_...,"[ ""Zimbabwe unity cabinet talks fail"" — BBC Ne...",04/03/2022 15:44:33
1725,Zinaida Greceanii nominated Moldovan Prime Min...,260,5.338462,2008-03-21,President Vladimir Voronin of Moldova today s...,"[March 21, 2008, Published, Archived, Moldova,...",https://en.wikinews.org/wiki/Zinaida_Greceanii...,"[ ""Президент подписал указ о назначении Зинаид...",04/03/2022 15:44:33


We used the following libraries:\
requests, beautifulsoup (bs4), regular expressions (re).

Key issues:\
    - Slow scraping speeds (1700+ articles)\
    - Inconsistent HTML tags and layouts\
    - Inconsistent text formatting (e.g. Dates are written as both November 11, 1111 and 1111-11-11)\
    - The end of articles do not terminate the same, they are padded with wikipedia metadata of indeterminate width and content.